From 07be6b38785defe03627f1907990d6b12619d042 Mon Sep 17 00:00:00 2001 From: andrewlalis Date: Mon, 9 Oct 2023 20:59:03 -0400 Subject: [PATCH] Added more complex indexing and search query. --- README.md | 9 +++ .../d_package_search/DPackageSearch.java | 2 +- .../d_package_search/PackageInfo.java | 6 +- .../d_package_search/WebApiRunner.java | 2 +- .../impl/DubRegistryPackageFetcher.java | 16 ++++- .../impl/LucenePackageIndexer.java | 70 +++++++++++++++++-- .../impl/LucenePackageSearcher.java | 36 +++++++--- 7 files changed, 122 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 5072d4a..09068b2 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h ## Setup To set up and run the program, all you need is Java version 21 or higher, and then run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `query=test` with what you want to search for. + +## Architecture + +The basic architecture of this searcher is that of your classic indexed search engine, which usually consists of the following steps: +1. Fetch raw data from somewhere. +2. Generate an index from that data. +3. Search for relevant data using the index. + +In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received. 
diff --git a/src/main/java/com/andrewlalis/d_package_search/DPackageSearch.java b/src/main/java/com/andrewlalis/d_package_search/DPackageSearch.java index f67836c..39e106a 100644 --- a/src/main/java/com/andrewlalis/d_package_search/DPackageSearch.java +++ b/src/main/java/com/andrewlalis/d_package_search/DPackageSearch.java @@ -28,7 +28,7 @@ public class DPackageSearch { while (true) { indexGenerator.run(); try { - Thread.sleep(Duration.ofMinutes(1)); + Thread.sleep(Duration.ofMinutes(5)); } catch (InterruptedException e) { System.err.println("Indexing thread interrupted: " + e.getMessage()); break; diff --git a/src/main/java/com/andrewlalis/d_package_search/PackageInfo.java b/src/main/java/com/andrewlalis/d_package_search/PackageInfo.java index ec40ff7..cc04b12 100644 --- a/src/main/java/com/andrewlalis/d_package_search/PackageInfo.java +++ b/src/main/java/com/andrewlalis/d_package_search/PackageInfo.java @@ -7,11 +7,15 @@ import java.time.LocalDateTime; * @param name The name of the package. * @param categories The list of categories the package is in. * @param versions The known list of versions for this package. + * @param fetchedAt Timestamp for when this package was fetched exactly, so + * that later indexing can use a unified timestamp for reference. */ public record PackageInfo( String name, String[] categories, - VersionInfo[] versions + VersionInfo[] versions, + long totalDownloads, + LocalDateTime fetchedAt ) { /** * Information about a specific version of a D package. 
diff --git a/src/main/java/com/andrewlalis/d_package_search/WebApiRunner.java b/src/main/java/com/andrewlalis/d_package_search/WebApiRunner.java index ddc9375..a613f08 100644 --- a/src/main/java/com/andrewlalis/d_package_search/WebApiRunner.java +++ b/src/main/java/com/andrewlalis/d_package_search/WebApiRunner.java @@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable { if (idx != -1) { String key = pair.substring(0, idx); if (key.trim().equalsIgnoreCase("query")) { - return pair.substring(idx + 1).trim().toUpperCase(); + return pair.substring(idx + 1).trim().toLowerCase(); } } } diff --git a/src/main/java/com/andrewlalis/d_package_search/impl/DubRegistryPackageFetcher.java b/src/main/java/com/andrewlalis/d_package_search/impl/DubRegistryPackageFetcher.java index 97ca795..c545856 100644 --- a/src/main/java/com/andrewlalis/d_package_search/impl/DubRegistryPackageFetcher.java +++ b/src/main/java/com/andrewlalis/d_package_search/impl/DubRegistryPackageFetcher.java @@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher { if (response.statusCode() != 200) { throw new IOException("Response status code " + response.statusCode()); } + LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC); ObjectMapper mapper = new ObjectMapper(); try (var in = new GZIPInputStream(response.body())) { ArrayNode array = mapper.readValue(in, ArrayNode.class); @@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher { for (JsonNode node : array) { if (node.isObject()) { try { - packages.add(parsePackage((ObjectNode) node)); + packages.add(parsePackage((ObjectNode) node, fetchedAt)); } catch (Exception e) { e.printStackTrace(); } @@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher { } } - private PackageInfo parsePackage(ObjectNode obj) { + private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) { return new PackageInfo( obj.get("name").asText(), 
mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]), - mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]) + mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]), + obj.get("stats").get("downloads").get("total").asLong(), + fetchedAt ); } @@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher { ); } + /** + * Maps a JSON array to a list of objects, using a mapping function. + * @param array The JSON array. + * @param mapper The mapper function to apply to each element of the array. + * @return The mapped list of objects. + * @param The type of the resultant list elements. + */ private static List mapJsonArray(ArrayNode array, Function mapper) { List list = new ArrayList<>(array.size()); for (JsonNode node : array) { diff --git a/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageIndexer.java b/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageIndexer.java index bea2917..7c63cf2 100644 --- a/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageIndexer.java +++ b/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageIndexer.java @@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer; import com.andrewlalis.d_package_search.PackageInfo; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StoredField; -import org.apache.lucene.document.TextField; +import org.apache.lucene.document.*; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import java.io.IOException; import java.nio.file.Path; +import 
java.time.Duration; +import java.time.ZoneOffset; +import java.util.*; +/** + * An indexer that produces a Lucene index, which is a directory, composed of + * possibly many index segments. + */ public class LucenePackageIndexer implements PackageIndexer { private final IndexWriter indexWriter; private final Directory dir; @@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer { this.indexWriter = new IndexWriter(dir, config); } + /** + * Adds a package to the Lucene index. This is the central place where the + * index's fields are defined. We define the following fields: + *
<ul> + * <li>name (text, stored)</li> + * <li>url (stored only)</li> + * <li>categories (multivalued string field with value for each category).</li> + * <li>latestVersionTimestamp (string field with date of latest version).</li> + * <li>description (optional text field)</li> + * <li>license (optional string field)</li> + * <li>readme (optional text field)</li> + * <li> + * features (feature field with the following features useful for scoring) + * <ul> + * <li>recency (0 - 1 value indicating how recent the package is)</li> + * <li>downloads (total downloads for the package)</li> + * </ul> + * </li> + * </ul>
+ * @param info The package to index. + * @throws IOException If an error occurs. + */ @Override public void addToIndex(PackageInfo info) throws IOException { + if (info.versions().length == 0) { + System.out.println("Skipping package \"" + info.name() + "\" because there are no versions available."); + return; + } + System.out.println("Indexing package \"" + info.name() + "\"."); String dubUrl = "https://code.dlang.org/packages/" + info.name(); + List allVersions = new ArrayList<>(Arrays.asList(info.versions())); + allVersions.sort(Comparator.comparing(PackageInfo.VersionInfo::timestamp).reversed()); + var recentVersions = allVersions.subList(0, Math.min(5, allVersions.size())); Document doc = new Document(); doc.add(new TextField("name", info.name(), Field.Store.YES)); doc.add(new StoredField("url", dubUrl)); + for (String category : info.categories()) { + doc.add(new StringField("categories", category, Field.Store.NO)); + } + + PackageInfo.VersionInfo latestVersion = recentVersions.getFirst(); + doc.add(new StringField( + "latestVersionTimestamp", + DateTools.dateToString(Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC)), DateTools.Resolution.SECOND), + Field.Store.NO + )); + if (latestVersion.description() != null) { + doc.add(new TextField("description", latestVersion.description(), Field.Store.NO)); + } + if (latestVersion.license() != null) { + doc.add(new StringField("license", latestVersion.license(), Field.Store.NO)); + } + if (latestVersion.readmeText() != null) { + doc.add(new TextField("readme", latestVersion.readmeText(), Field.Store.NO)); + } + + // Add FeatureFields to score packages based on some metrics. 
+ int daysSinceUpdate = Math.clamp(Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays(), 1, 365 * 3); + float recency = 1f / daysSinceUpdate; + float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE); + doc.add(new FeatureField("features", "recency", recency)); + doc.add(new FeatureField("features", "downloads", downloadsScore)); + indexWriter.addDocument(doc); } diff --git a/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java b/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java index 9377491..8bc61fa 100644 --- a/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java +++ b/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java @@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl; import com.andrewlalis.d_package_search.PackageSearchResult; import com.andrewlalis.d_package_search.PackageSearcher; import org.apache.lucene.document.Document; +import org.apache.lucene.document.FeatureField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; @@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.SequencedCollection; +import java.util.*; import java.util.concurrent.Executors; +/** + * A package searcher implementation that uses a weighted wildcard query to + * search a Lucene index. 
+ */ public class LucenePackageSearcher implements PackageSearcher { private final Path indexPath; @@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher { private Query buildQuery(String queryText) { BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); String[] searchTerms = queryText.toLowerCase().split("\\s+"); - for (String searchTerm : searchTerms) { - String wildcardTerm = searchTerm + "*"; - Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm)); - queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD); + + Map weightedFields = Map.of( + "name", 1f, + "description", 0.5f, + "readme", 0.25f + ); + + for (int i = 0; i < Math.min(5, searchTerms.length); i++) { + for (var entry : weightedFields.entrySet()) { + String fieldName = entry.getKey(); + float fieldWeight = entry.getValue(); + Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight); + queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD); + } } - return queryBuilder.build(); + Query baseQuery = queryBuilder.build(); + Query boostedQuery = new BooleanQuery.Builder() + .add(baseQuery, BooleanClause.Occur.MUST) + .add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD) + .add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD) + .build(); + return boostedQuery; } private PackageSearchResult prepareResult(Document doc) {