Added more complex indexing and search query.

This commit is contained in:
Andrew Lalis 2023-10-09 20:59:03 -04:00
parent ddc69c1c68
commit 07be6b3878
7 changed files with 122 additions and 19 deletions

View File

@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h
## Setup
To set up and run the program, all you need is Java version 21 or higher. Run the project from your favorite IDE; it will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `test` with what you want to search for.
## Architecture
The basic architecture of this searcher is that of a classic indexed search engine, which usually consists of the following steps:
1. Fetch raw data from somewhere.
2. Generate an index from that data.
3. Search for relevant data using the index.
In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received.

View File

@ -28,7 +28,7 @@ public class DPackageSearch {
while (true) {
indexGenerator.run();
try {
Thread.sleep(Duration.ofMinutes(1));
Thread.sleep(Duration.ofMinutes(5));
} catch (InterruptedException e) {
System.err.println("Indexing thread interrupted: " + e.getMessage());
break;

View File

@ -7,11 +7,15 @@ import java.time.LocalDateTime;
* @param name The name of the package.
* @param categories The list of categories the package is in.
* @param versions The known list of versions for this package.
* @param totalDownloads The total number of downloads recorded for this package.
* @param fetchedAt Timestamp for exactly when this package was fetched, so
* that later indexing can use a unified timestamp for reference.
*/
public record PackageInfo(
String name,
String[] categories,
VersionInfo[] versions
VersionInfo[] versions,
long totalDownloads,
LocalDateTime fetchedAt
) {
/**
* Information about a specific version of a D package.

View File

@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable {
if (idx != -1) {
String key = pair.substring(0, idx);
if (key.trim().equalsIgnoreCase("query")) {
return pair.substring(idx + 1).trim().toUpperCase();
return pair.substring(idx + 1).trim().toLowerCase();
}
}
}

View File

@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
if (response.statusCode() != 200) {
throw new IOException("Response status code " + response.statusCode());
}
LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC);
ObjectMapper mapper = new ObjectMapper();
try (var in = new GZIPInputStream(response.body())) {
ArrayNode array = mapper.readValue(in, ArrayNode.class);
@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
for (JsonNode node : array) {
if (node.isObject()) {
try {
packages.add(parsePackage((ObjectNode) node));
packages.add(parsePackage((ObjectNode) node, fetchedAt));
} catch (Exception e) {
e.printStackTrace();
}
@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
}
}
private PackageInfo parsePackage(ObjectNode obj) {
private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) {
return new PackageInfo(
obj.get("name").asText(),
mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0])
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]),
obj.get("stats").get("downloads").get("total").asLong(),
fetchedAt
);
}
@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
);
}
/**
* Maps a JSON array to a list of objects, using a mapping function.
* @param array The JSON array.
* @param mapper The mapper function to apply to each element of the array.
* @return The mapped list of objects.
* @param <T> The type of the resultant list elements.
*/
private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
List<T> list = new ArrayList<>(array.size());
for (JsonNode node : array) {

View File

@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer;
import com.andrewlalis.d_package_search.PackageInfo;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Path;
import java.time.Duration;
import java.time.ZoneOffset;
import java.util.*;
/**
* An indexer that produces a Lucene index, which is a directory, composed of
* possibly many index segments.
*/
public class LucenePackageIndexer implements PackageIndexer {
private final IndexWriter indexWriter;
private final Directory dir;
@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer {
this.indexWriter = new IndexWriter(dir, config);
}
/**
 * Indexes one package as a single Lucene document. Packages that have no
 * known versions are skipped. This method is the single place where the
 * index's fields are defined:
 * <ul>
 *     <li>name (text, stored)</li>
 *     <li>url (stored only)</li>
 *     <li>categories (multivalued string field, one value per category)</li>
 *     <li>latestVersionTimestamp (string field with the latest version's
 *     timestamp, at second resolution)</li>
 *     <li>description (optional text field, from the latest version)</li>
 *     <li>license (optional string field, from the latest version)</li>
 *     <li>readme (optional text field, from the latest version)</li>
 *     <li>
 *         features (feature field with the following features for scoring)
 *         <ul>
 *             <li>recency (1 divided by the number of days since the latest
 *             version, with the day count clamped to between 1 and 3 * 365)</li>
 *             <li>downloads (total downloads, with a lower bound of 0.001)</li>
 *         </ul>
 *     </li>
 * </ul>
 * @param info The package to index.
 * @throws IOException If an error occurs.
 */
@Override
public void addToIndex(PackageInfo info) throws IOException {
    final String packageName = info.name();
    if (info.versions().length == 0) {
        System.out.println("Skipping package \"" + packageName + "\" because there are no versions available.");
        return;
    }
    System.out.println("Indexing package \"" + packageName + "\".");

    // Only the version with the greatest timestamp is used for indexing.
    // On equal timestamps the earliest array element wins.
    PackageInfo.VersionInfo latestVersion = Collections.max(
        Arrays.asList(info.versions()),
        Comparator.comparing(PackageInfo.VersionInfo::timestamp)
    );

    Document doc = new Document();
    doc.add(new TextField("name", packageName, Field.Store.YES));
    doc.add(new StoredField("url", "https://code.dlang.org/packages/" + packageName));
    for (String category : info.categories()) {
        doc.add(new StringField("categories", category, Field.Store.NO));
    }

    // The version timestamp is interpreted as UTC when converted to an instant.
    Date latestDate = Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC));
    String latestTimestamp = DateTools.dateToString(latestDate, DateTools.Resolution.SECOND);
    doc.add(new StringField("latestVersionTimestamp", latestTimestamp, Field.Store.NO));

    // Optional fields are only added when the latest version provides them.
    String description = latestVersion.description();
    if (description != null) {
        doc.add(new TextField("description", description, Field.Store.NO));
    }
    String license = latestVersion.license();
    if (license != null) {
        doc.add(new StringField("license", license, Field.Store.NO));
    }
    String readme = latestVersion.readmeText();
    if (readme != null) {
        doc.add(new TextField("readme", readme, Field.Store.NO));
    }

    // Feature fields used to score packages by freshness and popularity.
    // Clamping the age to at least one day avoids a division by zero.
    long ageInDays = Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays();
    int daysSinceUpdate = Math.clamp(ageInDays, 1, 365 * 3);
    doc.add(new FeatureField("features", "recency", 1f / daysSinceUpdate));
    // The lower bound keeps the value strictly positive for packages with zero downloads.
    float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE);
    doc.add(new FeatureField("features", "downloads", downloadsScore));

    indexWriter.addDocument(doc);
}

View File

@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl;
import com.andrewlalis.d_package_search.PackageSearchResult;
import com.andrewlalis.d_package_search.PackageSearcher;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SequencedCollection;
import java.util.*;
import java.util.concurrent.Executors;
/**
* A package searcher implementation that uses a weighted wildcard query to
* search a Lucene index.
*/
public class LucenePackageSearcher implements PackageSearcher {
private final Path indexPath;
@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher {
private Query buildQuery(String queryText) {
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
String[] searchTerms = queryText.toLowerCase().split("\\s+");
for (String searchTerm : searchTerms) {
String wildcardTerm = searchTerm + "*";
Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm));
queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD);
Map<String, Float> weightedFields = Map.of(
"name", 1f,
"description", 0.5f,
"readme", 0.25f
);
for (int i = 0; i < Math.min(5, searchTerms.length); i++) {
for (var entry : weightedFields.entrySet()) {
String fieldName = entry.getKey();
float fieldWeight = entry.getValue();
Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight);
queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
}
}
return queryBuilder.build();
Query baseQuery = queryBuilder.build();
Query boostedQuery = new BooleanQuery.Builder()
.add(baseQuery, BooleanClause.Occur.MUST)
.add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD)
.add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD)
.build();
return boostedQuery;
}
private PackageSearchResult prepareResult(Document doc) {