From eb4a66e039a2c796e3fa51f6c2335a3bf834acb3 Mon Sep 17 00:00:00 2001 From: Andrew Lalis Date: Sun, 19 Mar 2023 09:59:42 +0100 Subject: [PATCH] Added initial index impl. --- .gitignore | 4 ++ pom.xml | 33 ++++++++++++ .../DubPackageFetcher.java | 43 ++++++++++++++++ .../DubRegistrySearch.java | 43 ++++++++++++++++ .../LucenePackageIndexer.java | 51 +++++++++++++++++++ .../dub_registry_search/PackageFetcher.java | 9 ++++ .../dub_registry_search/PackageIndexer.java | 9 ++++ 7 files changed, 192 insertions(+) create mode 100644 .gitignore create mode 100644 pom.xml create mode 100644 src/main/java/io/github/andrewlalis/dub_registry_search/DubPackageFetcher.java create mode 100644 src/main/java/io/github/andrewlalis/dub_registry_search/DubRegistrySearch.java create mode 100644 src/main/java/io/github/andrewlalis/dub_registry_search/LucenePackageIndexer.java create mode 100644 src/main/java/io/github/andrewlalis/dub_registry_search/PackageFetcher.java create mode 100644 src/main/java/io/github/andrewlalis/dub_registry_search/PackageIndexer.java diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6576de5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.iml +.idea/ +target/ +package-index/ \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..b991d90 --- /dev/null +++ b/pom.xml @@ -0,0 +1,33 @@ + + + 4.0.0 + + io.github.andrewlalis + dub-registry-search + 1.0.0-SNAPSHOT + + + 17 + 17 + UTF-8 + + + + + + org.apache.lucene + lucene-core + 9.5.0 + + + + + com.fasterxml.jackson.core + jackson-databind + 2.14.2 + + + + \ No newline at end of file diff --git a/src/main/java/io/github/andrewlalis/dub_registry_search/DubPackageFetcher.java b/src/main/java/io/github/andrewlalis/dub_registry_search/DubPackageFetcher.java new file mode 100644 index 0000000..a4fb6c5 --- /dev/null +++ b/src/main/java/io/github/andrewlalis/dub_registry_search/DubPackageFetcher.java @@ -0,0 +1,43 @@ +package 
io.github.andrewlalis.dub_registry_search;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.time.Duration;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * {@link PackageFetcher} that downloads the package dump from the URL in
+ * {@link #API_URL} and parses it into a JSON array.
+ */
+public class DubPackageFetcher implements PackageFetcher {
+    private static final String API_URL = "https://code.dlang.org/api/packages/dump";
+
+    /** Shared mapper, so a new ObjectMapper is not built on every fetch. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private final HttpClient httpClient = HttpClient.newBuilder()
+            .connectTimeout(Duration.ofSeconds(3))
+            .followRedirects(HttpClient.Redirect.NORMAL)
+            .build();
+
+    /**
+     * Sends a GET request to {@link #API_URL}, gunzips the response body and
+     * parses it as a JSON array.
+     *
+     * @return the parsed array of packages
+     * @throws IOException if the request fails, the response status is not 200,
+     *         or the body cannot be decompressed or parsed
+     * @throws RuntimeException if the calling thread is interrupted while waiting
+     *         for the response; the thread's interrupt flag is restored first
+     */
+    @Override
+    public ArrayNode fetch() throws IOException {
+        HttpRequest req = HttpRequest.newBuilder(URI.create(API_URL))
+                .GET()
+                .timeout(Duration.ofSeconds(60))
+                .header("Accept", "application/json")
+                .header("Accept-Encoding", "gzip")
+                .build();
+
+        HttpResponse<InputStream> response;
+        try {
+            response = httpClient.send(req, HttpResponse.BodyHandlers.ofInputStream());
+        } catch (InterruptedException e) {
+            // Restore the flag so code further up the stack can still observe the interrupt.
+            Thread.currentThread().interrupt();
+            throw new RuntimeException(e);
+        }
+
+        // A streamed body must be closed (or fully read) on every path, including
+        // the error path, otherwise the underlying exchange is not released.
+        try (InputStream body = response.body()) {
+            if (response.statusCode() != 200) {
+                throw new IOException("Response status code " + response.statusCode());
+            }
+            // The body is always decoded as gzip, matching the original behaviour.
+            try (var in = new GZIPInputStream(body)) {
+                return MAPPER.readValue(in, ArrayNode.class);
+            }
+        }
+    }
+}
diff --git a/src/main/java/io/github/andrewlalis/dub_registry_search/DubRegistrySearch.java b/src/main/java/io/github/andrewlalis/dub_registry_search/DubRegistrySearch.java
new file mode 100644
index 0000000..72631d9
--- /dev/null
+++ b/src/main/java/io/github/andrewlalis/dub_registry_search/DubRegistrySearch.java
@@ -0,0 +1,43 @@
+package io.github.andrewlalis.dub_registry_search;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import 
java.io.IOException;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
+
+/**
+ * Command-line entry point. When invoked with the single argument
+ * {@code index} (case-insensitive, surrounding whitespace ignored) it builds
+ * the package index; with any other arguments it does nothing.
+ */
+public class DubRegistrySearch {
+    public static void main(String[] args) throws Exception {
+        boolean indexRequested = args.length == 1 && args[0].strip().equalsIgnoreCase("index");
+        if (indexRequested) {
+            buildIndex();
+        }
+    }
+
+    /**
+     * Fetches all packages and adds each JSON object among them to a Lucene
+     * index in the {@code package-index} directory, then prints how many
+     * packages were added and how long the indexing loop took.
+     *
+     * <p>Array entries that are not JSON objects are skipped. An
+     * {@link IOException} raised for a single package is printed and does not
+     * stop the loop; that package is not counted.
+     *
+     * @throws Exception if fetching the packages, opening the index or closing
+     *         the index fails
+     */
+    public static void buildIndex() throws Exception {
+        System.out.println("Building package index.");
+        PackageFetcher fetcher = new DubPackageFetcher();
+        System.out.println("Fetching packages...");
+        ArrayNode packages = fetcher.fetch();
+
+        int indexedCount = 0;
+        Duration elapsed;
+        try (var indexer = new LucenePackageIndexer(Path.of("package-index"))) {
+            // The clock starts after the indexer is open and stops before it is closed.
+            Instant startedAt = Instant.now();
+            for (JsonNode entry : packages) {
+                if (!entry.isObject()) {
+                    continue;
+                }
+                try {
+                    indexer.addToIndex((ObjectNode) entry);
+                    indexedCount++;
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+            elapsed = Duration.between(startedAt, Instant.now());
+        }
+        System.out.println("Done! Added " + indexedCount + " packages to the index in " + elapsed.toMillis() + " ms.");
+    }
+}
diff --git a/src/main/java/io/github/andrewlalis/dub_registry_search/LucenePackageIndexer.java b/src/main/java/io/github/andrewlalis/dub_registry_search/LucenePackageIndexer.java
new file mode 100644
index 0000000..950a3c6
--- /dev/null
+++ b/src/main/java/io/github/andrewlalis/dub_registry_search/LucenePackageIndexer.java
@@ -0,0 +1,51 @@
+package io.github.andrewlalis.dub_registry_search;
+
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import 
org.apache.lucene.store.FSDirectory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+/**
+ * {@link PackageIndexer} that writes one Lucene document per package into an
+ * index stored in a filesystem directory. Must be closed after use so the
+ * writer can commit and release the directory.
+ */
+public class LucenePackageIndexer implements PackageIndexer, AutoCloseable {
+    private final IndexWriter indexWriter;
+    private final Directory dir;
+    private final Analyzer analyzer;
+
+    /**
+     * Opens {@code indexPath} as a Lucene directory and creates a writer over it.
+     * The writer is configured with {@code OpenMode.CREATE} and commit-on-close.
+     *
+     * @param indexPath directory that holds the index
+     * @throws IOException if the directory or the writer cannot be opened
+     */
+    public LucenePackageIndexer(Path indexPath) throws IOException {
+        this.dir = FSDirectory.open(indexPath);
+        this.analyzer = new StandardAnalyzer();
+        IndexWriterConfig config = new IndexWriterConfig(analyzer);
+        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+        config.setCommitOnClose(true);
+        this.indexWriter = new IndexWriter(dir, config);
+    }
+
+    /**
+     * Builds a document from the package's {@code _id} and {@code name} fields
+     * and adds it to the index. The document stores {@code registryId},
+     * {@code name} (also analyzed as text) and {@code dubUrl}.
+     *
+     * @param packageJson JSON object describing one package
+     * @throws IOException if {@code _id} or {@code name} is missing or null,
+     *         or if the writer fails to add the document
+     */
+    @Override
+    public void addToIndex(ObjectNode packageJson) throws IOException {
+        String registryId = requireText(packageJson, "_id");
+        String name = requireText(packageJson, "name");
+        String dubUrl = "https://code.dlang.org/packages/" + name;
+
+        Document doc = new Document();
+        doc.add(new StoredField("registryId", registryId));
+        doc.add(new TextField("name", name, Field.Store.YES));
+        doc.add(new StoredField("dubUrl", dubUrl));
+        // Without this call the document is built and then discarded.
+        indexWriter.addDocument(doc);
+    }
+
+    /**
+     * Returns the text of a required field, failing with a checked exception
+     * rather than a NullPointerException when the field is absent or null.
+     */
+    private static String requireText(ObjectNode packageJson, String fieldName) throws IOException {
+        if (!packageJson.hasNonNull(fieldName)) {
+            throw new IOException("Package JSON is missing required field \"" + fieldName + "\"");
+        }
+        return packageJson.get(fieldName).asText();
+    }
+
+    /**
+     * Closes the writer, then the analyzer, then the directory. Every resource
+     * is closed even if an earlier close fails.
+     *
+     * @throws Exception if closing any of the resources fails
+     */
+    @Override
+    public void close() throws Exception {
+        // try-with-resources closes in reverse declaration order, so the writer
+        // goes first; a failure in one close does not skip the others.
+        try (dir; analyzer; indexWriter) {
+            // Nothing to do here: the header alone performs the ordered close.
+        }
+    }
+}
diff --git a/src/main/java/io/github/andrewlalis/dub_registry_search/PackageFetcher.java b/src/main/java/io/github/andrewlalis/dub_registry_search/PackageFetcher.java
new file mode 100644
index 0000000..3eca6a6
--- /dev/null
+++ b/src/main/java/io/github/andrewlalis/dub_registry_search/PackageFetcher.java
@@ -0,0 +1,18 @@
+package io.github.andrewlalis.dub_registry_search;
+
+import com.fasterxml.jackson.databind.node.ArrayNode;
+
+import java.io.IOException;
+
+/**
+ * Source of package metadata.
+ */
+public interface PackageFetcher {
+    /**
+     * Retrieves the package metadata as a JSON array.
+     *
+     * @return the fetched packages
+     * @throws IOException if the packages cannot be retrieved or read
+     */
+    ArrayNode fetch() throws IOException;
+}
diff --git a/src/main/java/io/github/andrewlalis/dub_registry_search/PackageIndexer.java b/src/main/java/io/github/andrewlalis/dub_registry_search/PackageIndexer.java
new file mode 100644
index 0000000..1f5b0cc
--- /dev/null
+++ 
b/src/main/java/io/github/andrewlalis/dub_registry_search/PackageIndexer.java
@@ -0,0 +1,18 @@
+package io.github.andrewlalis.dub_registry_search;
+
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import java.io.IOException;
+
+/**
+ * Sink that accepts package metadata for indexing.
+ */
+public interface PackageIndexer {
+    /**
+     * Adds one package to the index.
+     *
+     * @param packageJson JSON object describing the package
+     * @throws IOException if the package cannot be added
+     */
+    void addToIndex(ObjectNode packageJson) throws IOException;
+}