Added more complex indexing and search query.
This commit is contained in:
parent
ddc69c1c68
commit
07be6b3878
|
@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
To set up and run the program, all you need is Java version 21 or higher, and then run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `query=test` with what you want to search for.
|
To set up and run the program, all you need is Java version 21 or higher, and then run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `query=test` with what you want to search for.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
The basic architecture of this searcher is that of a classic indexed search engine, which typically consists of the following steps:
|
||||||
|
1. Fetch raw data from somewhere.
|
||||||
|
2. Generate an index from that data.
|
||||||
|
3. Search for relevant data using the index.
|
||||||
|
|
||||||
|
In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received.
|
||||||
|
|
|
@ -28,7 +28,7 @@ public class DPackageSearch {
|
||||||
while (true) {
|
while (true) {
|
||||||
indexGenerator.run();
|
indexGenerator.run();
|
||||||
try {
|
try {
|
||||||
Thread.sleep(Duration.ofMinutes(1));
|
Thread.sleep(Duration.ofMinutes(5));
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
System.err.println("Indexing thread interrupted: " + e.getMessage());
|
System.err.println("Indexing thread interrupted: " + e.getMessage());
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -7,11 +7,15 @@ import java.time.LocalDateTime;
|
||||||
* @param name The name of the package.
|
* @param name The name of the package.
|
||||||
* @param categories The list of categories the package is in.
|
* @param categories The list of categories the package is in.
|
||||||
* @param versions The known list of versions for this package.
|
* @param versions The known list of versions for this package.
|
||||||
|
* @param fetchedAt The exact time at which this package was fetched, so
|
||||||
|
* that later indexing can use a unified timestamp for reference.
|
||||||
*/
|
*/
|
||||||
public record PackageInfo(
|
public record PackageInfo(
|
||||||
String name,
|
String name,
|
||||||
String[] categories,
|
String[] categories,
|
||||||
VersionInfo[] versions
|
VersionInfo[] versions,
|
||||||
|
long totalDownloads,
|
||||||
|
LocalDateTime fetchedAt
|
||||||
) {
|
) {
|
||||||
/**
|
/**
|
||||||
* Information about a specific version of a D package.
|
* Information about a specific version of a D package.
|
||||||
|
|
|
@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable {
|
||||||
if (idx != -1) {
|
if (idx != -1) {
|
||||||
String key = pair.substring(0, idx);
|
String key = pair.substring(0, idx);
|
||||||
if (key.trim().equalsIgnoreCase("query")) {
|
if (key.trim().equalsIgnoreCase("query")) {
|
||||||
return pair.substring(idx + 1).trim().toUpperCase();
|
return pair.substring(idx + 1).trim().toLowerCase();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
||||||
if (response.statusCode() != 200) {
|
if (response.statusCode() != 200) {
|
||||||
throw new IOException("Response status code " + response.statusCode());
|
throw new IOException("Response status code " + response.statusCode());
|
||||||
}
|
}
|
||||||
|
LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC);
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
ObjectMapper mapper = new ObjectMapper();
|
||||||
try (var in = new GZIPInputStream(response.body())) {
|
try (var in = new GZIPInputStream(response.body())) {
|
||||||
ArrayNode array = mapper.readValue(in, ArrayNode.class);
|
ArrayNode array = mapper.readValue(in, ArrayNode.class);
|
||||||
|
@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
||||||
for (JsonNode node : array) {
|
for (JsonNode node : array) {
|
||||||
if (node.isObject()) {
|
if (node.isObject()) {
|
||||||
try {
|
try {
|
||||||
packages.add(parsePackage((ObjectNode) node));
|
packages.add(parsePackage((ObjectNode) node, fetchedAt));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private PackageInfo parsePackage(ObjectNode obj) {
|
private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) {
|
||||||
return new PackageInfo(
|
return new PackageInfo(
|
||||||
obj.get("name").asText(),
|
obj.get("name").asText(),
|
||||||
mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
|
mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
|
||||||
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0])
|
mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]),
|
||||||
|
obj.get("stats").get("downloads").get("total").asLong(),
|
||||||
|
fetchedAt
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maps a JSON array to a list of objects, using a mapping function.
|
||||||
|
* @param array The JSON array.
|
||||||
|
* @param mapper The mapper function to apply to each element of the array.
|
||||||
|
* @return The mapped list of objects.
|
||||||
|
* @param <T> The type of the resultant list elements.
|
||||||
|
*/
|
||||||
private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
|
private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
|
||||||
List<T> list = new ArrayList<>(array.size());
|
List<T> list = new ArrayList<>(array.size());
|
||||||
for (JsonNode node : array) {
|
for (JsonNode node : array) {
|
||||||
|
|
|
@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer;
|
||||||
import com.andrewlalis.d_package_search.PackageInfo;
|
import com.andrewlalis.d_package_search.PackageInfo;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.*;
|
||||||
import org.apache.lucene.document.Field;
|
|
||||||
import org.apache.lucene.document.StoredField;
|
|
||||||
import org.apache.lucene.document.TextField;
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.IndexableFieldType;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.ZoneOffset;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An indexer that produces a Lucene index, which is a directory, composed of
|
||||||
|
* possibly many index segments.
|
||||||
|
*/
|
||||||
public class LucenePackageIndexer implements PackageIndexer {
|
public class LucenePackageIndexer implements PackageIndexer {
|
||||||
private final IndexWriter indexWriter;
|
private final IndexWriter indexWriter;
|
||||||
private final Directory dir;
|
private final Directory dir;
|
||||||
|
@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer {
|
||||||
this.indexWriter = new IndexWriter(dir, config);
|
this.indexWriter = new IndexWriter(dir, config);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a package to the Lucene index. This is the central place where the
|
||||||
|
* index's fields are defined. We define the following fields:
|
||||||
|
* <ul>
|
||||||
|
* <li>name (text, stored)</li>
|
||||||
|
* <li>url (stored only)</li>
|
||||||
|
* <li>categories (multivalued string field with one value per category)</li>
|
||||||
|
* <li>latestVersionTimestamp (string field with date of latest version).</li>
|
||||||
|
* <li>description (optional text field)</li>
|
||||||
|
* <li>license (optional string field)</li>
|
||||||
|
* <li>readme (optional text field)</li>
|
||||||
|
* <li>
|
||||||
|
* features (feature field with the following features useful for scoring)
|
||||||
|
* <ul>
|
||||||
|
* <li>recency (0 - 1 value indicating how recent the package is)</li>
|
||||||
|
* <li>downloads (total downloads for the package)</li>
|
||||||
|
* </ul>
|
||||||
|
* </li>
|
||||||
|
* </ul>
|
||||||
|
* @param info The package to index.
|
||||||
|
* @throws IOException If an error occurs.
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void addToIndex(PackageInfo info) throws IOException {
|
public void addToIndex(PackageInfo info) throws IOException {
|
||||||
|
if (info.versions().length == 0) {
|
||||||
|
System.out.println("Skipping package \"" + info.name() + "\" because there are no versions available.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
System.out.println("Indexing package \"" + info.name() + "\".");
|
||||||
String dubUrl = "https://code.dlang.org/packages/" + info.name();
|
String dubUrl = "https://code.dlang.org/packages/" + info.name();
|
||||||
|
List<PackageInfo.VersionInfo> allVersions = new ArrayList<>(Arrays.asList(info.versions()));
|
||||||
|
allVersions.sort(Comparator.comparing(PackageInfo.VersionInfo::timestamp).reversed());
|
||||||
|
var recentVersions = allVersions.subList(0, Math.min(5, allVersions.size()));
|
||||||
|
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(new TextField("name", info.name(), Field.Store.YES));
|
doc.add(new TextField("name", info.name(), Field.Store.YES));
|
||||||
doc.add(new StoredField("url", dubUrl));
|
doc.add(new StoredField("url", dubUrl));
|
||||||
|
for (String category : info.categories()) {
|
||||||
|
doc.add(new StringField("categories", category, Field.Store.NO));
|
||||||
|
}
|
||||||
|
|
||||||
|
PackageInfo.VersionInfo latestVersion = recentVersions.getFirst();
|
||||||
|
doc.add(new StringField(
|
||||||
|
"latestVersionTimestamp",
|
||||||
|
DateTools.dateToString(Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC)), DateTools.Resolution.SECOND),
|
||||||
|
Field.Store.NO
|
||||||
|
));
|
||||||
|
if (latestVersion.description() != null) {
|
||||||
|
doc.add(new TextField("description", latestVersion.description(), Field.Store.NO));
|
||||||
|
}
|
||||||
|
if (latestVersion.license() != null) {
|
||||||
|
doc.add(new StringField("license", latestVersion.license(), Field.Store.NO));
|
||||||
|
}
|
||||||
|
if (latestVersion.readmeText() != null) {
|
||||||
|
doc.add(new TextField("readme", latestVersion.readmeText(), Field.Store.NO));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add FeatureFields to score packages based on some metrics.
|
||||||
|
int daysSinceUpdate = Math.clamp(Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays(), 1, 365 * 3);
|
||||||
|
float recency = 1f / daysSinceUpdate;
|
||||||
|
float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE);
|
||||||
|
doc.add(new FeatureField("features", "recency", recency));
|
||||||
|
doc.add(new FeatureField("features", "downloads", downloadsScore));
|
||||||
|
|
||||||
indexWriter.addDocument(doc);
|
indexWriter.addDocument(doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl;
|
||||||
import com.andrewlalis.d_package_search.PackageSearchResult;
|
import com.andrewlalis.d_package_search.PackageSearchResult;
|
||||||
import com.andrewlalis.d_package_search.PackageSearcher;
|
import com.andrewlalis.d_package_search.PackageSearcher;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.FeatureField;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.*;
|
import org.apache.lucene.search.*;
|
||||||
|
@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.SequencedCollection;
|
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A package searcher implementation that uses a weighted prefix query to
|
||||||
|
* search a Lucene index.
|
||||||
|
*/
|
||||||
public class LucenePackageSearcher implements PackageSearcher {
|
public class LucenePackageSearcher implements PackageSearcher {
|
||||||
private final Path indexPath;
|
private final Path indexPath;
|
||||||
|
|
||||||
|
@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher {
|
||||||
private Query buildQuery(String queryText) {
|
private Query buildQuery(String queryText) {
|
||||||
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
|
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
|
||||||
String[] searchTerms = queryText.toLowerCase().split("\\s+");
|
String[] searchTerms = queryText.toLowerCase().split("\\s+");
|
||||||
for (String searchTerm : searchTerms) {
|
|
||||||
String wildcardTerm = searchTerm + "*";
|
Map<String, Float> weightedFields = Map.of(
|
||||||
Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm));
|
"name", 1f,
|
||||||
queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD);
|
"description", 0.5f,
|
||||||
|
"readme", 0.25f
|
||||||
|
);
|
||||||
|
|
||||||
|
for (int i = 0; i < Math.min(5, searchTerms.length); i++) {
|
||||||
|
for (var entry : weightedFields.entrySet()) {
|
||||||
|
String fieldName = entry.getKey();
|
||||||
|
float fieldWeight = entry.getValue();
|
||||||
|
Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight);
|
||||||
|
queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
|
||||||
}
|
}
|
||||||
return queryBuilder.build();
|
}
|
||||||
|
Query baseQuery = queryBuilder.build();
|
||||||
|
Query boostedQuery = new BooleanQuery.Builder()
|
||||||
|
.add(baseQuery, BooleanClause.Occur.MUST)
|
||||||
|
.add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD)
|
||||||
|
.add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
return boostedQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
private PackageSearchResult prepareResult(Document doc) {
|
private PackageSearchResult prepareResult(Document doc) {
|
||||||
|
|
Loading…
Reference in New Issue