Added web server and query searching (very basic).

This commit is contained in:
Andrew Lalis 2023-10-09 15:51:30 -04:00
parent eb4a66e039
commit ddc69c1c68
18 changed files with 453 additions and 123 deletions

View File

@ -1,2 +1,7 @@
# dub-registry-search
A search implementation for code.dlang.org
# D Package Search
An indexer and search API for D programming language packages as registered on https://code.dlang.org, using Apache Lucene.
## Setup
To set up and run the program, you need Java 21 or higher. Run the project from your favorite IDE; it boots a web server that you can use to search for packages at http://localhost:8080/search?query=test (replace `test` with what you want to search for).

20
pom.xml
View File

@ -4,13 +4,13 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>io.github.andrewlalis</groupId>
<artifactId>dub-registry-search</artifactId>
<groupId>com.andrewlalis</groupId>
<artifactId>d-package-search</artifactId>
<version>1.0.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>21</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
@ -19,15 +19,23 @@
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>9.5.0</version>
<version>9.8.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.14.2</version>
<version>2.15.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.eclipse.jetty/jetty-server -->
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
<version>12.0.1</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,39 @@
package com.andrewlalis.d_package_search;
import com.andrewlalis.d_package_search.impl.DubRegistryPackageFetcher;
import com.andrewlalis.d_package_search.impl.LucenePackageIndexer;
import com.andrewlalis.d_package_search.impl.LucenePackageSearcher;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
/**
 * Application entry point. Starts a background thread that periodically
 * regenerates the package index, then starts the HTTP search API that reads
 * from that same index directory.
 */
public class DPackageSearch {
    /** Pause between the end of one index generation and the start of the next. */
    private static final Duration REINDEX_INTERVAL = Duration.ofMinutes(1);

    public static void main(String[] args) {
        Path indexPath = Path.of("package-index");
        startIndexerThread(new IndexGenerator(
                new DubRegistryPackageFetcher(),
                () -> new LucenePackageIndexer(indexPath)
        ));
        new WebApiRunner(new LucenePackageSearcher(indexPath)).run();
    }

    /**
     * Starts a new (virtual) thread that periodically re-generates the package
     * index. A failed generation is logged and retried after the usual pause
     * instead of terminating the thread; the thread stops once it is
     * interrupted.
     * @param indexGenerator The index generator to use.
     */
    public static void startIndexerThread(IndexGenerator indexGenerator) {
        Thread.ofVirtual().start(() -> {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    indexGenerator.run();
                } catch (RuntimeException e) {
                    // An interrupt may reach us wrapped in an unchecked exception;
                    // treat it like an interrupt of the sleep below.
                    if (e.getCause() instanceof InterruptedException) {
                        System.err.println("Indexing thread interrupted: " + e.getCause().getMessage());
                        Thread.currentThread().interrupt();
                        break;
                    }
                    // Any other failure must not end periodic re-indexing.
                    System.err.println("Index generation failed: " + e);
                }
                try {
                    Thread.sleep(REINDEX_INTERVAL);
                } catch (InterruptedException e) {
                    System.err.println("Indexing thread interrupted: " + e.getMessage());
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        });
    }
}

View File

@ -0,0 +1,45 @@
package com.andrewlalis.d_package_search;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
/**
 * The index generator is a component that pieces all the parts of building an
 * index together into one runnable. It fetches packages using a fetcher, then
 * indexes them using an indexer obtained from the given supplier.
 * @param fetcher The fetcher to use to get packages.
 * @param indexerSupplier A supplier for a package indexer.
 */
public record IndexGenerator(
        PackageFetcher fetcher,
        ThrowableSupplier<PackageIndexer> indexerSupplier
) implements Runnable {
    /**
     * Fetches all packages, then adds each of them to a newly supplied
     * indexer, printing how long each of the two phases took.
     * <p>
     * If fetching fails with an {@link IOException}, the failure is logged and
     * the method returns without requesting an indexer.
     * @throws RuntimeException If the indexer cannot be obtained, a package
     * cannot be added, or the indexer fails to close. The original exception
     * is available as the cause.
     */
    @Override
    public void run() {
        System.out.println("Generating index...");
        Instant fetchStart = Instant.now();
        Collection<PackageInfo> packages;
        try {
            packages = fetcher.fetch();
        } catch (IOException e) {
            System.err.println("Failed to fetch packages: " + e.getMessage());
            return;
        }
        // Stop the clock before the indexer is opened so that the reported
        // fetch time covers the fetch only.
        Duration fetchDuration = Duration.between(fetchStart, Instant.now());
        System.out.println("Fetched " + packages.size() + " in " + fetchDuration.toMillis() + " ms.");

        try (PackageIndexer indexer = indexerSupplier.get()) {
            Instant indexStart = Instant.now();
            for (PackageInfo pkg : packages) {
                indexer.addToIndex(pkg);
            }
            Duration indexDuration = Duration.between(indexStart, Instant.now());
            System.out.println("Indexed all packages in " + indexDuration.toMillis() + " ms.");
        } catch (Exception e) {
            throw new RuntimeException("Failed to index packages", e);
        }
    }
}

View File

@ -0,0 +1,11 @@
package com.andrewlalis.d_package_search;
import java.io.IOException;
import java.util.Collection;
/**
 * A component responsible for fetching up-to-date information about packages.
 */
public interface PackageFetcher {
/**
 * Fetches the current set of packages.
 * @return The packages that were fetched.
 * @throws IOException If the packages could not be retrieved.
 */
Collection<PackageInfo> fetch() throws IOException;
}

View File

@ -0,0 +1,12 @@
package com.andrewlalis.d_package_search;
/**
 * A package indexer writes information about a given package to an index for
 * searching later. It is {@link AutoCloseable} so that it can be used in a
 * try-with-resources statement.
 */
public interface PackageIndexer extends AutoCloseable {
/**
 * Adds one package to the index.
 * @param info The package to add.
 * @throws Exception If the package could not be added.
 */
void addToIndex(PackageInfo info) throws Exception;
/**
 * Closes this indexer. The default implementation does nothing.
 * @throws Exception If closing fails.
 */
@Override
default void close() throws Exception {}
}

View File

@ -0,0 +1,33 @@
package com.andrewlalis.d_package_search;
import java.time.LocalDateTime;
/**
 * Information about a D package that is ready for indexing.
 * <p>
 * Note that the array components are stored as given (no copy is made), and
 * that the {@code equals} and {@code hashCode} generated for a record compare
 * array components by reference rather than by content.
 * @param name The name of the package.
 * @param categories The list of categories the package is in.
 * @param versions The known list of versions for this package.
 */
public record PackageInfo(
String name,
String[] categories,
VersionInfo[] versions
) {
/**
 * Information about a specific version of a D package.
 * @param timestamp The timestamp (in UTC) when the version was published.
 * @param versionTag The version tag string (e.g. "1.2.3").
 * @param description The version's description, or null.
 * @param license The version's license name (like "MIT" or "LGPL"), or null.
 * @param authors The list of authors for this version.
 * @param readmeText The text content of this version's README file.
 */
public record VersionInfo(
LocalDateTime timestamp,
String versionTag,
String description,
String license,
String[] authors,
String readmeText
) {}
}

View File

@ -0,0 +1,6 @@
package com.andrewlalis.d_package_search;
/**
 * A single package returned from a search.
 * @param name The name of the package.
 * @param url A URL for the package.
 */
public record PackageSearchResult(
String name,
String url
) {}

View File

@ -0,0 +1,7 @@
package com.andrewlalis.d_package_search;
import java.util.SequencedCollection;
/**
 * A component that searches for packages matching a textual query.
 */
public interface PackageSearcher {
/**
 * Searches for packages.
 * @param query The query text to search with.
 * @return The matching packages, in the collection's encounter order.
 */
SequencedCollection<PackageSearchResult> search(String query);
}

View File

@ -0,0 +1,6 @@
package com.andrewlalis.d_package_search;
/**
 * A supplier of values whose {@link #get()} method is allowed to throw a
 * checked exception.
 * @param <T> The type of value that is supplied.
 */
@FunctionalInterface
public interface ThrowableSupplier<T> {
/**
 * Gets a value.
 * @return The supplied value.
 * @throws Exception If the value could not be supplied.
 */
T get() throws Exception;
}

View File

@ -0,0 +1,88 @@
package com.andrewlalis.d_package_search;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.eclipse.jetty.http.HttpStatus;
import org.eclipse.jetty.http.HttpURI;
import org.eclipse.jetty.server.*;
import org.eclipse.jetty.util.Callback;
import org.eclipse.jetty.util.thread.QueuedThreadPool;
import java.net.URLDecoder;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.SequencedCollection;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
/**
 * Component that runs a simple HTTP endpoint, defaulting to localhost:8080/search?query=...
 * that allows clients to search the index via an HTTP request.
 */
public final class WebApiRunner extends Handler.Abstract implements Runnable {
    /** The TCP port that the HTTP server listens on. */
    private static final int PORT = 8080;

    private final PackageSearcher packageSearcher;
    private final ObjectMapper objectMapper;
    private final Executor threadPoolExecutor;

    /**
     * Constructs the runner.
     * @param packageSearcher The searcher used to answer search requests.
     */
    public WebApiRunner(PackageSearcher packageSearcher) {
        this.packageSearcher = packageSearcher;
        this.objectMapper = new ObjectMapper();
        this.threadPoolExecutor = Executors.newVirtualThreadPerTaskExecutor();
    }

    /**
     * Configures a Jetty server with this object as its handler and starts it.
     * @throws RuntimeException If the server fails to start.
     */
    @Override
    public void run() {
        QueuedThreadPool threadPool = new QueuedThreadPool();
        threadPool.setVirtualThreadsExecutor(threadPoolExecutor);
        threadPool.setName("http-server");
        Server server = new Server(threadPool);
        ServerConnector connector = new ServerConnector(server);
        connector.setPort(PORT);
        server.addConnector(connector);
        server.setHandler(this);
        try {
            server.start();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Handles a request. Only {@code GET /search?query=...} is served; any
     * other method yields 405 and any other path yields 404.
     * <p>
     * The callback is completed exactly once on every path: either it is
     * handed to {@code response.write}, which completes it when the write
     * finishes, or {@code callback.succeeded()} is called directly for
     * responses without a body.
     */
    @Override
    public boolean handle(Request request, Response response, Callback callback) throws Exception {
        if (!request.getMethod().equalsIgnoreCase("GET")) {
            response.setStatus(HttpStatus.METHOD_NOT_ALLOWED_405);
            response.getHeaders().add("Allow", "GET");
            callback.succeeded();
            return true;
        }

        HttpURI uri = request.getHttpURI();
        // Constant-first comparison so that a null path is simply "not found".
        if (!"/search".equalsIgnoreCase(uri.getPath())) {
            response.setStatus(HttpStatus.NOT_FOUND_404);
            callback.succeeded();
            return true;
        }

        String query = parseQuery(uri.getQuery());
        if (query == null || query.isBlank()) {
            response.setStatus(HttpStatus.BAD_REQUEST_400);
            response.getHeaders().add("Content-Type", "text/plain; charset=utf-8");
            byte[] message = "Missing required \"query\" parameter.".getBytes(StandardCharsets.UTF_8);
            response.write(true, ByteBuffer.wrap(message), callback);
            return true;
        }

        System.out.println("Searching with query \"" + query + "\".");
        SequencedCollection<PackageSearchResult> results = packageSearcher.search(query);
        // Serialize before touching the response so that a serialization
        // failure does not leave a half-prepared 200 response behind.
        byte[] responseBody = objectMapper.writeValueAsBytes(results);
        response.setStatus(HttpStatus.OK_200);
        response.getHeaders().add("Content-Type", "application/json; charset=utf-8");
        response.write(true, ByteBuffer.wrap(responseBody), callback);
        return true;
    }

    /**
     * Extracts the value of the {@code query} parameter from a raw (still
     * percent-encoded) URI query string.
     * <p>
     * The string is split into {@code key=value} pairs first and each part is
     * decoded afterwards, so that an encoded {@code &} or {@code =} inside a
     * value is not mistaken for a separator. Pairs that cannot be decoded are
     * skipped.
     * @param rawQuery The raw query string, or null if the URI has none.
     * @return The trimmed, upper-cased parameter value, or null if there is no
     * usable {@code query} parameter.
     */
    private static String parseQuery(String rawQuery) {
        if (rawQuery == null) {
            return null;
        }
        for (String pair : rawQuery.split("&")) {
            int idx = pair.indexOf('=');
            if (idx == -1) {
                continue;
            }
            try {
                String key = URLDecoder.decode(pair.substring(0, idx), StandardCharsets.UTF_8);
                if (key.trim().equalsIgnoreCase("query")) {
                    String value = URLDecoder.decode(pair.substring(idx + 1), StandardCharsets.UTF_8);
                    return value.trim().toUpperCase(java.util.Locale.ROOT);
                }
            } catch (IllegalArgumentException e) {
                // Malformed percent-encoding in this pair; ignore the pair.
            }
        }
        return null;
    }
}

View File

@ -0,0 +1,107 @@
package com.andrewlalis.d_package_search.impl;
import com.andrewlalis.d_package_search.PackageFetcher;
import com.andrewlalis.d_package_search.PackageInfo;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;
/**
 * A package fetcher that pulls directly from the Dub registry's JSON dump.
 */
public class DubRegistryPackageFetcher implements PackageFetcher {
    private static final String API_URL = "https://code.dlang.org/api/packages/dump";

    private final HttpClient httpClient = HttpClient.newBuilder()
            .connectTimeout(Duration.ofSeconds(3))
            .followRedirects(HttpClient.Redirect.NORMAL)
            .build();
    private final ObjectMapper mapper = new ObjectMapper();

    /**
     * Downloads the registry dump and converts every JSON object in it to a
     * {@link PackageInfo}. Entries that are not objects are ignored, and
     * entries that cannot be parsed are logged and skipped.
     * @return The packages that could be parsed.
     * @throws IOException If the request fails, the status code is not 200, or
     * the body is not a JSON array.
     * @throws RuntimeException If the calling thread is interrupted while
     * waiting for the response (the interrupt flag is restored first).
     */
    @Override
    public Collection<PackageInfo> fetch() throws IOException {
        HttpRequest req = HttpRequest.newBuilder(URI.create(API_URL))
                .GET()
                .timeout(Duration.ofSeconds(60))
                .header("Accept", "application/json")
                .header("Accept-Encoding", "gzip")
                .build();
        HttpResponse<InputStream> response;
        try {
            response = httpClient.send(req, HttpResponse.BodyHandlers.ofInputStream());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // The body stream is closed on every path, including the error path.
        try (InputStream body = response.body()) {
            if (response.statusCode() != 200) {
                throw new IOException("Response status code " + response.statusCode());
            }
            // We only asked for gzip; the server may still answer uncompressed.
            boolean gzipped = response.headers().firstValue("Content-Encoding")
                    .map(encoding -> encoding.trim().equalsIgnoreCase("gzip"))
                    .orElse(false);
            try (InputStream in = gzipped ? new GZIPInputStream(body) : body) {
                return parsePackages(mapper.readValue(in, ArrayNode.class));
            }
        }
    }

    /** Converts each object element of the dump to a package, skipping bad entries. */
    private Collection<PackageInfo> parsePackages(ArrayNode array) {
        Collection<PackageInfo> packages = new ArrayList<>(array.size());
        for (JsonNode node : array) {
            if (!node.isObject()) {
                continue;
            }
            try {
                packages.add(parsePackage((ObjectNode) node));
            } catch (RuntimeException e) {
                System.err.println("Skipping package \"" + node.path("name").asText() + "\": " + e);
            }
        }
        return packages;
    }

    /** Parses one package object; "categories" and "versions" default to empty when absent. */
    private PackageInfo parsePackage(ObjectNode obj) {
        return new PackageInfo(
                requireText(obj, "name"),
                mapJsonArray(obj.get("categories"), JsonNode::asText).toArray(new String[0]),
                mapJsonArray(obj.get("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0])
        );
    }

    /**
     * Parses one version object. "date" and "version" are required; the
     * description and license are null when absent, authors default to an
     * empty array and the readme defaults to an empty string.
     */
    private PackageInfo.VersionInfo parseVersion(JsonNode node) {
        String description = null;
        String license = null;
        String[] authors = new String[0];
        if (node.hasNonNull("info")) {
            JsonNode infoNode = node.get("info");
            if (infoNode.hasNonNull("description")) {
                description = infoNode.get("description").asText();
            }
            if (infoNode.hasNonNull("license")) {
                license = infoNode.get("license").asText();
            }
            authors = mapJsonArray(infoNode.get("authors"), JsonNode::asText).toArray(authors);
        }
        String readme = node.hasNonNull("readme") ? node.get("readme").asText() : "";
        return new PackageInfo.VersionInfo(
                OffsetDateTime.parse(requireText(node, "date"))
                        .atZoneSameInstant(ZoneOffset.UTC)
                        .toLocalDateTime(),
                requireText(node, "version"),
                description,
                license,
                authors,
                readme
        );
    }

    /** Returns the text of a field, or throws if the field is absent or JSON null. */
    private static String requireText(JsonNode node, String field) {
        if (!node.hasNonNull(field)) {
            throw new IllegalArgumentException("Missing required field \"" + field + "\".");
        }
        return node.get(field).asText();
    }

    /**
     * Maps every element of a JSON array. The tree is only read, never
     * modified; a null or non-array node yields an empty list.
     */
    private static <T> List<T> mapJsonArray(JsonNode array, Function<JsonNode, T> mapper) {
        if (array == null || !array.isArray()) {
            return new ArrayList<>();
        }
        List<T> list = new ArrayList<>(array.size());
        for (JsonNode node : array) {
            list.add(mapper.apply(node));
        }
        return list;
    }
}

View File

@ -1,6 +1,7 @@
package io.github.andrewlalis.dub_registry_search;
package com.andrewlalis.d_package_search.impl;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.andrewlalis.d_package_search.PackageIndexer;
import com.andrewlalis.d_package_search.PackageInfo;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@ -15,7 +16,7 @@ import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Path;
public class LucenePackageIndexer implements PackageIndexer, AutoCloseable {
public class LucenePackageIndexer implements PackageIndexer {
private final IndexWriter indexWriter;
private final Directory dir;
private final Analyzer analyzer;
@ -29,17 +30,14 @@ public class LucenePackageIndexer implements PackageIndexer, AutoCloseable {
this.indexWriter = new IndexWriter(dir, config);
}
@Override
public void addToIndex(ObjectNode packageJson) throws IOException {
String registryId = packageJson.get("_id").asText();
String name = packageJson.get("name").asText();
String dubUrl = "https://code.dlang.org/packages/" + name;
public void addToIndex(PackageInfo info) throws IOException {
String dubUrl = "https://code.dlang.org/packages/" + info.name();
Document doc = new Document();
doc.add(new StoredField("registryId", registryId));
doc.add(new TextField("name", name, Field.Store.YES));
doc.add(new StoredField("dubUrl", dubUrl));
doc.add(new TextField("name", info.name(), Field.Store.YES));
doc.add(new StoredField("url", dubUrl));
indexWriter.addDocument(doc);
}
@Override

View File

@ -0,0 +1,69 @@
package com.andrewlalis.d_package_search.impl;
import com.andrewlalis.d_package_search.PackageSearchResult;
import com.andrewlalis.d_package_search.PackageSearcher;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SequencedCollection;
import java.util.concurrent.Executors;
/**
 * A package searcher that queries a Lucene index stored in a directory on
 * disk. The index is opened anew for every search.
 */
public class LucenePackageSearcher implements PackageSearcher {
    /** The maximum number of results returned by a single search. */
    private static final int MAX_RESULTS = 25;

    private final Path indexPath;

    /**
     * Constructs the searcher.
     * @param indexPath The directory containing the index to search.
     */
    public LucenePackageSearcher(Path indexPath) {
        this.indexPath = indexPath;
    }

    /**
     * Searches the index for packages matching the given query.
     * @param query The query text to use.
     * @return Up to {@value #MAX_RESULTS} results sorted by relevance, or an
     * empty list if the query is null or blank, the index directory does not
     * exist, or the index cannot be read.
     */
    @Override
    public SequencedCollection<PackageSearchResult> search(String query) {
        if (query == null || query.isBlank() || Files.notExists(indexPath)) return Collections.emptyList();
        Query luceneQuery = buildQuery(query);
        // The directory, the reader and the executor are all released when the
        // search ends; closing the reader alone does not close the directory.
        try (
                var directory = FSDirectory.open(indexPath);
                DirectoryReader dirReader = DirectoryReader.open(directory);
                var executor = Executors.newVirtualThreadPerTaskExecutor()
        ) {
            IndexSearcher searcher = new IndexSearcher(dirReader, executor);
            TopDocs topDocs = searcher.search(luceneQuery, MAX_RESULTS, Sort.RELEVANCE, false);
            var storedFields = searcher.storedFields();
            List<PackageSearchResult> results = new ArrayList<>(topDocs.scoreDocs.length);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                results.add(prepareResult(storedFields.document(scoreDoc.doc)));
            }
            return results;
        } catch (IOException e) {
            System.err.println("An IOException occurred while reading index: " + e.getMessage());
            return Collections.emptyList();
        }
    }

    /**
     * Builds the Lucene search query for a given textual query string. Each
     * whitespace-separated term becomes an optional prefix match on the
     * package name. Terms are matched literally, so characters such as
     * {@code *} or {@code ?} in the user's text are not treated as wildcards.
     * @param queryText The query text to use.
     * @return The query to use.
     */
    private Query buildQuery(String queryText) {
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        String normalized = queryText.strip().toLowerCase(java.util.Locale.ROOT);
        for (String searchTerm : normalized.split("\\s+")) {
            if (searchTerm.isEmpty()) {
                // An empty prefix would match every package.
                continue;
            }
            queryBuilder.add(new PrefixQuery(new Term("name", searchTerm)), BooleanClause.Occur.SHOULD);
        }
        return queryBuilder.build();
    }

    /** Converts a stored document to a search result using its "name" and "url" fields. */
    private PackageSearchResult prepareResult(Document doc) {
        return new PackageSearchResult(
                doc.get("name"),
                doc.get("url")
        );
    }
}

View File

@ -1,43 +0,0 @@
package io.github.andrewlalis.dub_registry_search;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.zip.GZIPInputStream;
/**
 * A package fetcher that downloads the Dub registry's JSON dump and returns
 * it as a JSON array.
 */
public class DubPackageFetcher implements PackageFetcher {
    private static final String API_URL = "https://code.dlang.org/api/packages/dump";

    private final HttpClient httpClient = HttpClient.newBuilder()
            .connectTimeout(Duration.ofSeconds(3))
            .followRedirects(HttpClient.Redirect.NORMAL)
            .build();

    /**
     * Downloads and parses the registry dump.
     * @return The dump's root JSON array.
     * @throws IOException If the request fails, the status code is not 200, or
     * the body is not a JSON array.
     * @throws RuntimeException If the calling thread is interrupted while
     * waiting for the response (the interrupt flag is restored first).
     */
    @Override
    public ArrayNode fetch() throws IOException {
        HttpRequest req = HttpRequest.newBuilder(URI.create(API_URL))
                .GET()
                .timeout(Duration.ofSeconds(60))
                .header("Accept", "application/json")
                .header("Accept-Encoding", "gzip")
                .build();
        HttpResponse<InputStream> response;
        try {
            response = httpClient.send(req, HttpResponse.BodyHandlers.ofInputStream());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // The body stream is closed on every path, including the error path.
        try (InputStream body = response.body()) {
            if (response.statusCode() != 200) {
                throw new IOException("Response status code " + response.statusCode());
            }
            // We only asked for gzip; the server may still answer uncompressed.
            boolean gzipped = response.headers().firstValue("Content-Encoding")
                    .map(encoding -> encoding.trim().equalsIgnoreCase("gzip"))
                    .orElse(false);
            ObjectMapper mapper = new ObjectMapper();
            try (InputStream in = gzipped ? new GZIPInputStream(body) : body) {
                return mapper.readValue(in, ArrayNode.class);
            }
        }
    }
}

View File

@ -1,43 +0,0 @@
package io.github.andrewlalis.dub_registry_search;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
/**
 * Command-line entry point that builds the package index when invoked with
 * the single argument "index".
 */
public class DubRegistrySearch {
    /**
     * Builds the index if exactly one argument is given and it equals "index"
     * (ignoring case and surrounding whitespace); otherwise does nothing.
     */
    public static void main(String[] args) throws Exception {
        boolean indexRequested = args.length == 1 && args[0].strip().equalsIgnoreCase("index");
        if (indexRequested) {
            buildIndex();
        }
    }

    /**
     * Fetches all packages and adds every JSON object among them to the index
     * in the "package-index" directory, then reports how many were added and
     * how long indexing took. A package that fails with an
     * {@link IOException} is reported and skipped.
     */
    public static void buildIndex() throws Exception {
        System.out.println("Building package index.");
        PackageFetcher packageSource = new DubPackageFetcher();
        System.out.println("Fetching packages...");
        ArrayNode allPackages = packageSource.fetch();

        int indexedCount = 0;
        Duration elapsed;
        try (var indexer = new LucenePackageIndexer(Path.of("package-index"))) {
            Instant startedAt = Instant.now();
            for (JsonNode entry : allPackages) {
                if (!entry.isObject()) {
                    continue;
                }
                try {
                    indexer.addToIndex((ObjectNode) entry);
                    indexedCount++;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            elapsed = Duration.between(startedAt, Instant.now());
        }
        System.out.println("Done! Added " + indexedCount + " packages to the index in " + elapsed.toMillis() + " ms.");
    }
}

View File

@ -1,9 +0,0 @@
package io.github.andrewlalis.dub_registry_search;
import com.fasterxml.jackson.databind.node.ArrayNode;
import java.io.IOException;
/**
 * A component responsible for fetching package information as JSON.
 */
public interface PackageFetcher {
/**
 * Fetches the current set of packages.
 * @return A JSON array of the fetched packages.
 * @throws IOException If the packages could not be retrieved.
 */
ArrayNode fetch() throws IOException;
}

View File

@ -1,9 +0,0 @@
package io.github.andrewlalis.dub_registry_search;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
/**
 * A package indexer writes information from a given JSON package object to an
 * index.
 */
public interface PackageIndexer {
/**
 * Adds one package to the index.
 * @param packageJson The JSON object describing the package.
 * @throws IOException If the package could not be written to the index.
 */
void addToIndex(ObjectNode packageJson) throws IOException;
}