Added more complex indexing and search query.
This commit is contained in:
		
							parent
							
								
									ddc69c1c68
								
							
						
					
					
						commit
						07be6b3878
					
				| 
						 | 
					@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h
 | 
				
			||||||
## Setup
 | 
					## Setup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To set up and run the program, install Java version 21 or higher and run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `test` with what you want to search for.
 | 
					To set up and run the program, install Java version 21 or higher and run the project using your favorite IDE. It will boot up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `test` with what you want to search for.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Architecture
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The basic architecture of this searcher is that of a classic indexed search engine, which typically consists of the following steps:
 | 
				
			||||||
 | 
					1. Fetch raw data from somewhere.
 | 
				
			||||||
 | 
					2. Generate an index from that data.
 | 
				
			||||||
 | 
					3. Search for relevant data using the index.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -28,7 +28,7 @@ public class DPackageSearch {
 | 
				
			||||||
			while (true) {
 | 
								while (true) {
 | 
				
			||||||
				indexGenerator.run();
 | 
									indexGenerator.run();
 | 
				
			||||||
				try {
 | 
									try {
 | 
				
			||||||
					Thread.sleep(Duration.ofMinutes(1));
 | 
										Thread.sleep(Duration.ofMinutes(5));
 | 
				
			||||||
				} catch (InterruptedException e) {
 | 
									} catch (InterruptedException e) {
 | 
				
			||||||
					System.err.println("Indexing thread interrupted: " + e.getMessage());
 | 
										System.err.println("Indexing thread interrupted: " + e.getMessage());
 | 
				
			||||||
					break;
 | 
										break;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,11 +7,15 @@ import java.time.LocalDateTime;
 | 
				
			||||||
 * @param name The name of the package.
 | 
					 * @param name The name of the package.
 | 
				
			||||||
 * @param categories The list of categories the package is in.
 | 
					 * @param categories The list of categories the package is in.
 | 
				
			||||||
 * @param versions The known list of versions for this package.
 | 
					 * @param versions The known list of versions for this package.
 | 
				
			||||||
 | 
					 * @param fetchedAt The exact timestamp at which this package was fetched, so
 | 
				
			||||||
 | 
					 *                  that later indexing can use a unified timestamp for reference.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
public record PackageInfo(
 | 
					public record PackageInfo(
 | 
				
			||||||
        String name,
 | 
					        String name,
 | 
				
			||||||
        String[] categories,
 | 
					        String[] categories,
 | 
				
			||||||
        VersionInfo[] versions
 | 
					        VersionInfo[] versions,
 | 
				
			||||||
 | 
					        long totalDownloads,
 | 
				
			||||||
 | 
					        LocalDateTime fetchedAt
 | 
				
			||||||
) {
 | 
					) {
 | 
				
			||||||
    /**
 | 
					    /**
 | 
				
			||||||
     * Information about a specific version of a D package.
 | 
					     * Information about a specific version of a D package.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable {
 | 
				
			||||||
            if (idx != -1) {
 | 
					            if (idx != -1) {
 | 
				
			||||||
                String key = pair.substring(0, idx);
 | 
					                String key = pair.substring(0, idx);
 | 
				
			||||||
                if (key.trim().equalsIgnoreCase("query")) {
 | 
					                if (key.trim().equalsIgnoreCase("query")) {
 | 
				
			||||||
                    return pair.substring(idx + 1).trim().toUpperCase();
 | 
					                    return pair.substring(idx + 1).trim().toLowerCase();
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
				
			||||||
			if (response.statusCode() != 200) {
 | 
								if (response.statusCode() != 200) {
 | 
				
			||||||
				throw new IOException("Response status code " + response.statusCode());
 | 
									throw new IOException("Response status code " + response.statusCode());
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
 | 
								LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC);
 | 
				
			||||||
			ObjectMapper mapper = new ObjectMapper();
 | 
								ObjectMapper mapper = new ObjectMapper();
 | 
				
			||||||
			try (var in = new GZIPInputStream(response.body())) {
 | 
								try (var in = new GZIPInputStream(response.body())) {
 | 
				
			||||||
				ArrayNode array = mapper.readValue(in, ArrayNode.class);
 | 
									ArrayNode array = mapper.readValue(in, ArrayNode.class);
 | 
				
			||||||
| 
						 | 
					@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
				
			||||||
				for (JsonNode node : array) {
 | 
									for (JsonNode node : array) {
 | 
				
			||||||
					if (node.isObject()) {
 | 
										if (node.isObject()) {
 | 
				
			||||||
						try {
 | 
											try {
 | 
				
			||||||
							packages.add(parsePackage((ObjectNode) node));
 | 
												packages.add(parsePackage((ObjectNode) node, fetchedAt));
 | 
				
			||||||
						} catch (Exception e) {
 | 
											} catch (Exception e) {
 | 
				
			||||||
							e.printStackTrace();
 | 
												e.printStackTrace();
 | 
				
			||||||
						}
 | 
											}
 | 
				
			||||||
| 
						 | 
					@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	private PackageInfo parsePackage(ObjectNode obj) {
 | 
						private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) {
 | 
				
			||||||
		return new PackageInfo(
 | 
							return new PackageInfo(
 | 
				
			||||||
				obj.get("name").asText(),
 | 
									obj.get("name").asText(),
 | 
				
			||||||
				mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
 | 
									mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
 | 
				
			||||||
				mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0])
 | 
									mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]),
 | 
				
			||||||
 | 
									obj.get("stats").get("downloads").get("total").asLong(),
 | 
				
			||||||
 | 
									fetchedAt
 | 
				
			||||||
		);
 | 
							);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
				
			||||||
		);
 | 
							);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/**
 | 
				
			||||||
 | 
						 * Maps a JSON array to a list of objects, using a mapping function.
 | 
				
			||||||
 | 
						 * @param array The JSON array.
 | 
				
			||||||
 | 
						 * @param mapper The mapper function to apply to each element of the array.
 | 
				
			||||||
 | 
						 * @return The mapped list of objects.
 | 
				
			||||||
 | 
						 * @param <T> The type of the resultant list elements.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
	private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
 | 
						private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
 | 
				
			||||||
		List<T> list = new ArrayList<>(array.size());
 | 
							List<T> list = new ArrayList<>(array.size());
 | 
				
			||||||
		for (JsonNode node : array) {
 | 
							for (JsonNode node : array) {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer;
 | 
				
			||||||
import com.andrewlalis.d_package_search.PackageInfo;
 | 
					import com.andrewlalis.d_package_search.PackageInfo;
 | 
				
			||||||
import org.apache.lucene.analysis.Analyzer;
 | 
					import org.apache.lucene.analysis.Analyzer;
 | 
				
			||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
 | 
					import org.apache.lucene.analysis.standard.StandardAnalyzer;
 | 
				
			||||||
import org.apache.lucene.document.Document;
 | 
					import org.apache.lucene.document.*;
 | 
				
			||||||
import org.apache.lucene.document.Field;
 | 
					 | 
				
			||||||
import org.apache.lucene.document.StoredField;
 | 
					 | 
				
			||||||
import org.apache.lucene.document.TextField;
 | 
					 | 
				
			||||||
import org.apache.lucene.index.IndexWriter;
 | 
					import org.apache.lucene.index.IndexWriter;
 | 
				
			||||||
import org.apache.lucene.index.IndexWriterConfig;
 | 
					import org.apache.lucene.index.IndexWriterConfig;
 | 
				
			||||||
 | 
					import org.apache.lucene.index.IndexableFieldType;
 | 
				
			||||||
import org.apache.lucene.store.Directory;
 | 
					import org.apache.lucene.store.Directory;
 | 
				
			||||||
import org.apache.lucene.store.FSDirectory;
 | 
					import org.apache.lucene.store.FSDirectory;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import java.io.IOException;
 | 
					import java.io.IOException;
 | 
				
			||||||
import java.nio.file.Path;
 | 
					import java.nio.file.Path;
 | 
				
			||||||
 | 
					import java.time.Duration;
 | 
				
			||||||
 | 
					import java.time.ZoneOffset;
 | 
				
			||||||
 | 
					import java.util.*;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * An indexer that produces a Lucene index, which is a directory, composed of
 | 
				
			||||||
 | 
					 * possibly many index segments.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
public class LucenePackageIndexer implements PackageIndexer {
 | 
					public class LucenePackageIndexer implements PackageIndexer {
 | 
				
			||||||
	private final IndexWriter indexWriter;
 | 
						private final IndexWriter indexWriter;
 | 
				
			||||||
	private final Directory dir;
 | 
						private final Directory dir;
 | 
				
			||||||
| 
						 | 
					@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer {
 | 
				
			||||||
		this.indexWriter = new IndexWriter(dir, config);
 | 
							this.indexWriter = new IndexWriter(dir, config);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/**
 | 
				
			||||||
 | 
						 * Adds a package to the Lucene index. This is the central place where the
 | 
				
			||||||
 | 
						 * index's fields are defined. We define the following fields:
 | 
				
			||||||
 | 
						 * <ul>
 | 
				
			||||||
 | 
						 *     <li>name (text, stored)</li>
 | 
				
			||||||
 | 
						 *     <li>url (stored only)</li>
 | 
				
			||||||
 | 
						 *     <li>categories (multivalued string field with value for each category).</li>
 | 
				
			||||||
 | 
						 *     <li>latestVersionTimestamp (string field with date of latest version).</li>
 | 
				
			||||||
 | 
						 *     <li>description (optional text field)</li>
 | 
				
			||||||
 | 
						 *     <li>license (optional string field)</li>
 | 
				
			||||||
 | 
						 *     <li>readme (optional text field)</li>
 | 
				
			||||||
 | 
						 *     <li>
 | 
				
			||||||
 | 
						 *         features (feature field with the following features useful for scoring)
 | 
				
			||||||
 | 
						 *         <ul>
 | 
				
			||||||
 | 
						 *             <li>recency (0 - 1 value indicating how recent the package is)</li>
 | 
				
			||||||
 | 
						 *             <li>downloads (total downloads for the package)</li>
 | 
				
			||||||
 | 
						 *         </ul>
 | 
				
			||||||
 | 
						 *     </li>
 | 
				
			||||||
 | 
						 * </ul>
 | 
				
			||||||
 | 
						 * @param info The package to index.
 | 
				
			||||||
 | 
						 * @throws IOException If an error occurs.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
	@Override
 | 
						@Override
 | 
				
			||||||
	public void addToIndex(PackageInfo info) throws IOException {
 | 
						public void addToIndex(PackageInfo info) throws IOException {
 | 
				
			||||||
 | 
							if (info.versions().length == 0) {
 | 
				
			||||||
 | 
								System.out.println("Skipping package \"" + info.name() + "\" because there are no versions available.");
 | 
				
			||||||
 | 
								return;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							System.out.println("Indexing package \"" + info.name() + "\".");
 | 
				
			||||||
		String dubUrl = "https://code.dlang.org/packages/" + info.name();
 | 
							String dubUrl = "https://code.dlang.org/packages/" + info.name();
 | 
				
			||||||
 | 
							List<PackageInfo.VersionInfo> allVersions = new ArrayList<>(Arrays.asList(info.versions()));
 | 
				
			||||||
 | 
							allVersions.sort(Comparator.comparing(PackageInfo.VersionInfo::timestamp).reversed());
 | 
				
			||||||
 | 
							var recentVersions = allVersions.subList(0, Math.min(5, allVersions.size()));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Document doc = new Document();
 | 
							Document doc = new Document();
 | 
				
			||||||
		doc.add(new TextField("name", info.name(), Field.Store.YES));
 | 
							doc.add(new TextField("name", info.name(), Field.Store.YES));
 | 
				
			||||||
		doc.add(new StoredField("url", dubUrl));
 | 
							doc.add(new StoredField("url", dubUrl));
 | 
				
			||||||
 | 
							for (String category : info.categories()) {
 | 
				
			||||||
 | 
								doc.add(new StringField("categories", category, Field.Store.NO));
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							PackageInfo.VersionInfo latestVersion = recentVersions.getFirst();
 | 
				
			||||||
 | 
							doc.add(new StringField(
 | 
				
			||||||
 | 
									"latestVersionTimestamp",
 | 
				
			||||||
 | 
									DateTools.dateToString(Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC)), DateTools.Resolution.SECOND),
 | 
				
			||||||
 | 
									Field.Store.NO
 | 
				
			||||||
 | 
							));
 | 
				
			||||||
 | 
							if (latestVersion.description() != null) {
 | 
				
			||||||
 | 
								doc.add(new TextField("description", latestVersion.description(), Field.Store.NO));
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							if (latestVersion.license() != null) {
 | 
				
			||||||
 | 
								doc.add(new StringField("license", latestVersion.license(), Field.Store.NO));
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							if (latestVersion.readmeText() != null) {
 | 
				
			||||||
 | 
								doc.add(new TextField("readme", latestVersion.readmeText(), Field.Store.NO));
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							// Add FeatureFields to score packages based on some metrics.
 | 
				
			||||||
 | 
							int daysSinceUpdate = Math.clamp(Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays(), 1, 365 * 3);
 | 
				
			||||||
 | 
							float recency = 1f / daysSinceUpdate;
 | 
				
			||||||
 | 
							float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE);
 | 
				
			||||||
 | 
							doc.add(new FeatureField("features", "recency", recency));
 | 
				
			||||||
 | 
							doc.add(new FeatureField("features", "downloads", downloadsScore));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		indexWriter.addDocument(doc);
 | 
							indexWriter.addDocument(doc);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl;
 | 
				
			||||||
import com.andrewlalis.d_package_search.PackageSearchResult;
 | 
					import com.andrewlalis.d_package_search.PackageSearchResult;
 | 
				
			||||||
import com.andrewlalis.d_package_search.PackageSearcher;
 | 
					import com.andrewlalis.d_package_search.PackageSearcher;
 | 
				
			||||||
import org.apache.lucene.document.Document;
 | 
					import org.apache.lucene.document.Document;
 | 
				
			||||||
 | 
					import org.apache.lucene.document.FeatureField;
 | 
				
			||||||
import org.apache.lucene.index.DirectoryReader;
 | 
					import org.apache.lucene.index.DirectoryReader;
 | 
				
			||||||
import org.apache.lucene.index.Term;
 | 
					import org.apache.lucene.index.Term;
 | 
				
			||||||
import org.apache.lucene.search.*;
 | 
					import org.apache.lucene.search.*;
 | 
				
			||||||
| 
						 | 
					@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory;
 | 
				
			||||||
import java.io.IOException;
 | 
					import java.io.IOException;
 | 
				
			||||||
import java.nio.file.Files;
 | 
					import java.nio.file.Files;
 | 
				
			||||||
import java.nio.file.Path;
 | 
					import java.nio.file.Path;
 | 
				
			||||||
import java.util.ArrayList;
 | 
					import java.util.*;
 | 
				
			||||||
import java.util.Collections;
 | 
					 | 
				
			||||||
import java.util.List;
 | 
					 | 
				
			||||||
import java.util.SequencedCollection;
 | 
					 | 
				
			||||||
import java.util.concurrent.Executors;
 | 
					import java.util.concurrent.Executors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * A package searcher implementation that uses weighted prefix queries to
 | 
				
			||||||
 | 
					 * search a Lucene index.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
public class LucenePackageSearcher implements PackageSearcher {
 | 
					public class LucenePackageSearcher implements PackageSearcher {
 | 
				
			||||||
    private final Path indexPath;
 | 
					    private final Path indexPath;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher {
 | 
				
			||||||
    private Query buildQuery(String queryText) {
 | 
					    private Query buildQuery(String queryText) {
 | 
				
			||||||
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
 | 
					        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
 | 
				
			||||||
        String[] searchTerms = queryText.toLowerCase().split("\\s+");
 | 
					        String[] searchTerms = queryText.toLowerCase().split("\\s+");
 | 
				
			||||||
        for (String searchTerm : searchTerms) {
 | 
					
 | 
				
			||||||
            String wildcardTerm = searchTerm + "*";
 | 
					        Map<String, Float> weightedFields = Map.of(
 | 
				
			||||||
            Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm));
 | 
					                "name", 1f,
 | 
				
			||||||
            queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD);
 | 
					                "description", 0.5f,
 | 
				
			||||||
 | 
					                "readme", 0.25f
 | 
				
			||||||
 | 
					        );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for (int i = 0; i < Math.min(5, searchTerms.length); i++) {
 | 
				
			||||||
 | 
					            for (var entry : weightedFields.entrySet()) {
 | 
				
			||||||
 | 
					                String fieldName = entry.getKey();
 | 
				
			||||||
 | 
					                float fieldWeight = entry.getValue();
 | 
				
			||||||
 | 
					                Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight);
 | 
				
			||||||
 | 
					                queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
        return queryBuilder.build();
 | 
					        Query baseQuery = queryBuilder.build();
 | 
				
			||||||
 | 
					        Query boostedQuery = new BooleanQuery.Builder()
 | 
				
			||||||
 | 
					                .add(baseQuery, BooleanClause.Occur.MUST)
 | 
				
			||||||
 | 
					                .add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD)
 | 
				
			||||||
 | 
					                .add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD)
 | 
				
			||||||
 | 
					                .build();
 | 
				
			||||||
 | 
					        return boostedQuery;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    private PackageSearchResult prepareResult(Document doc) {
 | 
					    private PackageSearchResult prepareResult(Document doc) {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue