Added more complex indexing and search query.
This commit is contained in:
parent
ddc69c1c68
commit
07be6b3878
|
@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h
|
|||
## Setup
|
||||
|
||||
To set up and run the program, all you need is Java 21 or higher. Run the project from your favorite IDE; it starts a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `test` in `query=test` with whatever you want to search for.
|
||||
|
||||
## Architecture
|
||||
|
||||
The basic architecture of this searcher is that of a classic indexed search engine, which usually consists of the following steps:
|
||||
1. Fetch raw data from somewhere.
|
||||
2. Generate an index from that data.
|
||||
3. Search for relevant data using the index.
|
||||
|
||||
In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received.
|
||||
|
|
|
@ -28,7 +28,7 @@ public class DPackageSearch {
|
|||
while (true) {
|
||||
indexGenerator.run();
|
||||
try {
|
||||
Thread.sleep(Duration.ofMinutes(1));
|
||||
Thread.sleep(Duration.ofMinutes(5));
|
||||
} catch (InterruptedException e) {
|
||||
System.err.println("Indexing thread interrupted: " + e.getMessage());
|
||||
break;
|
||||
|
|
|
@ -7,11 +7,15 @@ import java.time.LocalDateTime;
|
|||
* @param name The name of the package.
|
||||
* @param categories The list of categories the package is in.
|
||||
* @param versions The known list of versions for this package.
|
||||
* @param totalDownloads The total number of downloads recorded for the package.
* @param fetchedAt The exact time at which this package was fetched, so
|
||||
* that later indexing can use a unified timestamp for reference.
|
||||
*/
|
||||
public record PackageInfo(
|
||||
String name,
|
||||
String[] categories,
|
||||
VersionInfo[] versions
|
||||
VersionInfo[] versions,
|
||||
long totalDownloads,
|
||||
LocalDateTime fetchedAt
|
||||
) {
|
||||
/**
|
||||
* Information about a specific version of a D package.
|
||||
|
|
|
@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable {
|
|||
if (idx != -1) {
|
||||
String key = pair.substring(0, idx);
|
||||
if (key.trim().equalsIgnoreCase("query")) {
|
||||
return pair.substring(idx + 1).trim().toUpperCase();
|
||||
return pair.substring(idx + 1).trim().toLowerCase();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
|||
if (response.statusCode() != 200) {
|
||||
throw new IOException("Response status code " + response.statusCode());
|
||||
}
|
||||
LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC);
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
try (var in = new GZIPInputStream(response.body())) {
|
||||
ArrayNode array = mapper.readValue(in, ArrayNode.class);
|
||||
|
@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
|||
for (JsonNode node : array) {
|
||||
if (node.isObject()) {
|
||||
try {
|
||||
packages.add(parsePackage((ObjectNode) node));
|
||||
packages.add(parsePackage((ObjectNode) node, fetchedAt));
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
|||
}
|
||||
}
|
||||
|
||||
private PackageInfo parsePackage(ObjectNode obj) {
|
||||
private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) {
|
||||
return new PackageInfo(
|
||||
obj.get("name").asText(),
|
||||
mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
|
||||
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0])
|
||||
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]),
|
||||
obj.get("stats").get("downloads").get("total").asLong(),
|
||||
fetchedAt
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
|||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Maps a JSON array to a list of objects, using a mapping function.
|
||||
* @param array The JSON array.
|
||||
* @param mapper The mapper function to apply to each element of the array.
|
||||
* @return The mapped list of objects.
|
||||
* @param <T> The type of the resultant list elements.
|
||||
*/
|
||||
private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
|
||||
List<T> list = new ArrayList<>(array.size());
|
||||
for (JsonNode node : array) {
|
||||
|
|
|
@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer;
|
|||
import com.andrewlalis.d_package_search.PackageInfo;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexableFieldType;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* An indexer that produces a Lucene index, which is a directory, composed of
|
||||
* possibly many index segments.
|
||||
*/
|
||||
public class LucenePackageIndexer implements PackageIndexer {
|
||||
private final IndexWriter indexWriter;
|
||||
private final Directory dir;
|
||||
|
@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer {
|
|||
this.indexWriter = new IndexWriter(dir, config);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a package to the Lucene index. This is the central place where the
|
||||
* index's fields are defined. We define the following fields:
|
||||
* <ul>
|
||||
* <li>name (text, stored)</li>
|
||||
* <li>url (stored only)</li>
|
||||
* <li>categories (multivalued string field with value for each category).</li>
|
||||
* <li>latestVersionTimestamp (string field with the latest version's timestamp, at second resolution).</li>
|
||||
* <li>description (optional text field)</li>
|
||||
* <li>license (optional string field)</li>
|
||||
* <li>readme (optional text field)</li>
|
||||
* <li>
|
||||
* features (feature field with the following features useful for scoring)
|
||||
* <ul>
|
||||
* <li>recency (1 / days since the latest version, with days clamped to [1, 1095]; higher means more recent)</li>
|
||||
* <li>downloads (total downloads for the package, clamped to a minimum of 0.001)</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* </ul>
|
||||
* @param info The package to index.
|
||||
* @throws IOException If an error occurs.
|
||||
*/
|
||||
@Override
|
||||
public void addToIndex(PackageInfo info) throws IOException {
|
||||
if (info.versions().length == 0) {
|
||||
System.out.println("Skipping package \"" + info.name() + "\" because there are no versions available.");
|
||||
return;
|
||||
}
|
||||
System.out.println("Indexing package \"" + info.name() + "\".");
|
||||
String dubUrl = "https://code.dlang.org/packages/" + info.name();
|
||||
List<PackageInfo.VersionInfo> allVersions = new ArrayList<>(Arrays.asList(info.versions()));
|
||||
allVersions.sort(Comparator.comparing(PackageInfo.VersionInfo::timestamp).reversed());
|
||||
var recentVersions = allVersions.subList(0, Math.min(5, allVersions.size()));
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new TextField("name", info.name(), Field.Store.YES));
|
||||
doc.add(new StoredField("url", dubUrl));
|
||||
for (String category : info.categories()) {
|
||||
doc.add(new StringField("categories", category, Field.Store.NO));
|
||||
}
|
||||
|
||||
PackageInfo.VersionInfo latestVersion = recentVersions.getFirst();
|
||||
doc.add(new StringField(
|
||||
"latestVersionTimestamp",
|
||||
DateTools.dateToString(Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC)), DateTools.Resolution.SECOND),
|
||||
Field.Store.NO
|
||||
));
|
||||
if (latestVersion.description() != null) {
|
||||
doc.add(new TextField("description", latestVersion.description(), Field.Store.NO));
|
||||
}
|
||||
if (latestVersion.license() != null) {
|
||||
doc.add(new StringField("license", latestVersion.license(), Field.Store.NO));
|
||||
}
|
||||
if (latestVersion.readmeText() != null) {
|
||||
doc.add(new TextField("readme", latestVersion.readmeText(), Field.Store.NO));
|
||||
}
|
||||
|
||||
// Add FeatureFields to score packages based on some metrics.
|
||||
int daysSinceUpdate = Math.clamp(Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays(), 1, 365 * 3);
|
||||
float recency = 1f / daysSinceUpdate;
|
||||
float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE);
|
||||
doc.add(new FeatureField("features", "recency", recency));
|
||||
doc.add(new FeatureField("features", "downloads", downloadsScore));
|
||||
|
||||
indexWriter.addDocument(doc);
|
||||
}
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl;
|
|||
import com.andrewlalis.d_package_search.PackageSearchResult;
|
||||
import com.andrewlalis.d_package_search.PackageSearcher;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.FeatureField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
|
@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory;
|
|||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.SequencedCollection;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* A package searcher implementation that uses weighted prefix queries to
|
||||
* search a Lucene index.
|
||||
*/
|
||||
public class LucenePackageSearcher implements PackageSearcher {
|
||||
private final Path indexPath;
|
||||
|
||||
|
@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher {
|
|||
private Query buildQuery(String queryText) {
|
||||
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
|
||||
String[] searchTerms = queryText.toLowerCase().split("\\s+");
|
||||
for (String searchTerm : searchTerms) {
|
||||
String wildcardTerm = searchTerm + "*";
|
||||
Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm));
|
||||
queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD);
|
||||
|
||||
Map<String, Float> weightedFields = Map.of(
|
||||
"name", 1f,
|
||||
"description", 0.5f,
|
||||
"readme", 0.25f
|
||||
);
|
||||
|
||||
for (int i = 0; i < Math.min(5, searchTerms.length); i++) {
|
||||
for (var entry : weightedFields.entrySet()) {
|
||||
String fieldName = entry.getKey();
|
||||
float fieldWeight = entry.getValue();
|
||||
Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight);
|
||||
queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
return queryBuilder.build();
|
||||
}
|
||||
Query baseQuery = queryBuilder.build();
|
||||
Query boostedQuery = new BooleanQuery.Builder()
|
||||
.add(baseQuery, BooleanClause.Occur.MUST)
|
||||
.add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD)
|
||||
.add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
return boostedQuery;
|
||||
}
|
||||
|
||||
private PackageSearchResult prepareResult(Document doc) {
|
||||
|
|
Loading…
Reference in New Issue