Added more complex indexing and search query.

This commit is contained in:
Andrew Lalis 2023-10-09 20:59:03 -04:00
parent ddc69c1c68
commit 07be6b3878
7 changed files with 122 additions and 19 deletions

View File

@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h
## Setup ## Setup
To set up and run the program, all you need is Java version 21 or higher, and then run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `query=test` with what you want to search for. To set up and run the program, all you need is Java version 21 or higher, and then run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `query=test` with what you want to search for.
## Architecture
The basic architecture of this searcher is that of a classic indexed search engine, which usually consists of the following steps:
1. Fetch raw data from somewhere.
2. Generate an index from that data.
3. Search for relevant data using the index.
In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received.

View File

@ -28,7 +28,7 @@ public class DPackageSearch {
while (true) { while (true) {
indexGenerator.run(); indexGenerator.run();
try { try {
Thread.sleep(Duration.ofMinutes(1)); Thread.sleep(Duration.ofMinutes(5));
} catch (InterruptedException e) { } catch (InterruptedException e) {
System.err.println("Indexing thread interrupted: " + e.getMessage()); System.err.println("Indexing thread interrupted: " + e.getMessage());
break; break;

View File

@ -7,11 +7,15 @@ import java.time.LocalDateTime;
* @param name The name of the package. * @param name The name of the package.
* @param categories The list of categories the package is in. * @param categories The list of categories the package is in.
* @param versions The known list of versions for this package. * @param versions The known list of versions for this package.
* @param totalDownloads The total number of downloads reported for the package.
* @param fetchedAt The exact time at which this package was fetched, so
* that later indexing can use a unified timestamp for reference.
*/ */
public record PackageInfo( public record PackageInfo(
String name, String name,
String[] categories, String[] categories,
VersionInfo[] versions VersionInfo[] versions,
long totalDownloads,
LocalDateTime fetchedAt
) { ) {
/** /**
* Information about a specific version of a D package. * Information about a specific version of a D package.

View File

@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable {
if (idx != -1) { if (idx != -1) {
String key = pair.substring(0, idx); String key = pair.substring(0, idx);
if (key.trim().equalsIgnoreCase("query")) { if (key.trim().equalsIgnoreCase("query")) {
return pair.substring(idx + 1).trim().toUpperCase(); return pair.substring(idx + 1).trim().toLowerCase();
} }
} }
} }

View File

@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
if (response.statusCode() != 200) { if (response.statusCode() != 200) {
throw new IOException("Response status code " + response.statusCode()); throw new IOException("Response status code " + response.statusCode());
} }
LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC);
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
try (var in = new GZIPInputStream(response.body())) { try (var in = new GZIPInputStream(response.body())) {
ArrayNode array = mapper.readValue(in, ArrayNode.class); ArrayNode array = mapper.readValue(in, ArrayNode.class);
@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
for (JsonNode node : array) { for (JsonNode node : array) {
if (node.isObject()) { if (node.isObject()) {
try { try {
packages.add(parsePackage((ObjectNode) node)); packages.add(parsePackage((ObjectNode) node, fetchedAt));
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
} }
} }
private PackageInfo parsePackage(ObjectNode obj) { private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) {
return new PackageInfo( return new PackageInfo(
obj.get("name").asText(), obj.get("name").asText(),
mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]), mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]) mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]),
obj.get("stats").get("downloads").get("total").asLong(),
fetchedAt
); );
} }
@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
); );
} }
/**
* Maps a JSON array to a list of objects, using a mapping function.
* @param array The JSON array.
* @param mapper The mapper function to apply to each element of the array.
* @return The mapped list of objects.
* @param <T> The type of the resultant list elements.
*/
private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) { private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
List<T> list = new ArrayList<>(array.size()); List<T> list = new ArrayList<>(array.size());
for (JsonNode node : array) { for (JsonNode node : array) {

View File

@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer;
import com.andrewlalis.d_package_search.PackageInfo; import com.andrewlalis.d_package_search.PackageInfo;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.*;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
import java.time.Duration;
import java.time.ZoneOffset;
import java.util.*;
/**
* An indexer that produces a Lucene index, which is a directory, composed of
* possibly many index segments.
*/
public class LucenePackageIndexer implements PackageIndexer { public class LucenePackageIndexer implements PackageIndexer {
private final IndexWriter indexWriter; private final IndexWriter indexWriter;
private final Directory dir; private final Directory dir;
@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer {
this.indexWriter = new IndexWriter(dir, config); this.indexWriter = new IndexWriter(dir, config);
} }
/**
* Adds a package to the Lucene index. This is the central place where the
* index's fields are defined. We define the following fields:
* <ul>
* <li>name (text, stored)</li>
* <li>url (stored only)</li>
* <li>categories (multivalued string field with one value for each category).</li>
* <li>latestVersionTimestamp (string field with date of latest version).</li>
* <li>description (optional text field)</li>
* <li>license (optional string field)</li>
* <li>readme (optional text field)</li>
* <li>
* features (feature field with the following features useful for scoring)
* <ul>
* <li>recency (0 - 1 value indicating how recent the package is)</li>
* <li>downloads (total downloads for the package)</li>
* </ul>
* </li>
* </ul>
* @param info The package to index.
* @throws IOException If an error occurs.
*/
@Override @Override
public void addToIndex(PackageInfo info) throws IOException { public void addToIndex(PackageInfo info) throws IOException {
if (info.versions().length == 0) {
System.out.println("Skipping package \"" + info.name() + "\" because there are no versions available.");
return;
}
System.out.println("Indexing package \"" + info.name() + "\".");
String dubUrl = "https://code.dlang.org/packages/" + info.name(); String dubUrl = "https://code.dlang.org/packages/" + info.name();
List<PackageInfo.VersionInfo> allVersions = new ArrayList<>(Arrays.asList(info.versions()));
allVersions.sort(Comparator.comparing(PackageInfo.VersionInfo::timestamp).reversed());
var recentVersions = allVersions.subList(0, Math.min(5, allVersions.size()));
Document doc = new Document(); Document doc = new Document();
doc.add(new TextField("name", info.name(), Field.Store.YES)); doc.add(new TextField("name", info.name(), Field.Store.YES));
doc.add(new StoredField("url", dubUrl)); doc.add(new StoredField("url", dubUrl));
for (String category : info.categories()) {
doc.add(new StringField("categories", category, Field.Store.NO));
}
PackageInfo.VersionInfo latestVersion = recentVersions.getFirst();
doc.add(new StringField(
"latestVersionTimestamp",
DateTools.dateToString(Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC)), DateTools.Resolution.SECOND),
Field.Store.NO
));
if (latestVersion.description() != null) {
doc.add(new TextField("description", latestVersion.description(), Field.Store.NO));
}
if (latestVersion.license() != null) {
doc.add(new StringField("license", latestVersion.license(), Field.Store.NO));
}
if (latestVersion.readmeText() != null) {
doc.add(new TextField("readme", latestVersion.readmeText(), Field.Store.NO));
}
// Add FeatureFields to score packages based on some metrics.
int daysSinceUpdate = Math.clamp(Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays(), 1, 365 * 3);
float recency = 1f / daysSinceUpdate;
float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE);
doc.add(new FeatureField("features", "recency", recency));
doc.add(new FeatureField("features", "downloads", downloadsScore));
indexWriter.addDocument(doc); indexWriter.addDocument(doc);
} }

View File

@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl;
import com.andrewlalis.d_package_search.PackageSearchResult; import com.andrewlalis.d_package_search.PackageSearchResult;
import com.andrewlalis.d_package_search.PackageSearcher; import com.andrewlalis.d_package_search.PackageSearcher;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList; import java.util.*;
import java.util.Collections;
import java.util.List;
import java.util.SequencedCollection;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
/**
* A package searcher implementation that searches a Lucene index using
* weighted prefix queries, boosted by recency and download-count features.
*/
public class LucenePackageSearcher implements PackageSearcher { public class LucenePackageSearcher implements PackageSearcher {
private final Path indexPath; private final Path indexPath;
@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher {
private Query buildQuery(String queryText) { private Query buildQuery(String queryText) {
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
String[] searchTerms = queryText.toLowerCase().split("\\s+"); String[] searchTerms = queryText.toLowerCase().split("\\s+");
for (String searchTerm : searchTerms) {
String wildcardTerm = searchTerm + "*"; Map<String, Float> weightedFields = Map.of(
Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm)); "name", 1f,
queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD); "description", 0.5f,
"readme", 0.25f
);
for (int i = 0; i < Math.min(5, searchTerms.length); i++) {
for (var entry : weightedFields.entrySet()) {
String fieldName = entry.getKey();
float fieldWeight = entry.getValue();
Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight);
queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
}
} }
return queryBuilder.build(); Query baseQuery = queryBuilder.build();
Query boostedQuery = new BooleanQuery.Builder()
.add(baseQuery, BooleanClause.Occur.MUST)
.add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD)
.add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD)
.build();
return boostedQuery;
} }
private PackageSearchResult prepareResult(Document doc) { private PackageSearchResult prepareResult(Document doc) {