Added more complex indexing and search query.
This commit is contained in:
		
							parent
							
								
									ddc69c1c68
								
							
						
					
					
						commit
						07be6b3878
					
				| 
						 | 
				
			
			@ -5,3 +5,12 @@ An indexer and search API for D programming language packages as registered on h
 | 
			
		|||
## Setup
 | 
			
		||||
 | 
			
		||||
To set up and run the program, you only need Java version 21 or higher. Open the project in your favorite IDE and run it; this boots up a web server that you can use to search for packages at http://localhost:8080/search?query=test, replacing `test` with what you want to search for.
 | 
			
		||||
 | 
			
		||||
## Architecture
 | 
			
		||||
 | 
			
		||||
The basic architecture of this searcher is that of a classic indexed search engine, which is usually composed of the following steps:
 | 
			
		||||
1. Fetch raw data from somewhere.
 | 
			
		||||
2. Generate an index from that data.
 | 
			
		||||
3. Search for relevant data using the index.
 | 
			
		||||
 | 
			
		||||
In this application, steps 1 and 2 are done periodically in a separate thread, to ensure that the data stays relatively fresh. Step 3 is done whenever a request to the `/search` endpoint is received.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -28,7 +28,7 @@ public class DPackageSearch {
 | 
			
		|||
			while (true) {
 | 
			
		||||
				indexGenerator.run();
 | 
			
		||||
				try {
 | 
			
		||||
					Thread.sleep(Duration.ofMinutes(1));
 | 
			
		||||
					Thread.sleep(Duration.ofMinutes(5));
 | 
			
		||||
				} catch (InterruptedException e) {
 | 
			
		||||
					System.err.println("Indexing thread interrupted: " + e.getMessage());
 | 
			
		||||
					break;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -7,11 +7,15 @@ import java.time.LocalDateTime;
 | 
			
		|||
 * @param name The name of the package.
 | 
			
		||||
 * @param categories The list of categories the package is in.
 | 
			
		||||
 * @param versions The known list of versions for this package.
 * @param totalDownloads The total number of downloads recorded for this package.
 | 
			
		||||
 * @param fetchedAt Timestamp for when this package was fetched exactly, so
 | 
			
		||||
 *                  that later indexing can use a unified timestamp for reference.
 | 
			
		||||
 */
 | 
			
		||||
public record PackageInfo(
 | 
			
		||||
        String name,
 | 
			
		||||
        String[] categories,
 | 
			
		||||
        VersionInfo[] versions
 | 
			
		||||
        VersionInfo[] versions,
 | 
			
		||||
        long totalDownloads,
 | 
			
		||||
        LocalDateTime fetchedAt
 | 
			
		||||
) {
 | 
			
		||||
    /**
 | 
			
		||||
     * Information about a specific version of a D package.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -79,7 +79,7 @@ public final class WebApiRunner extends Handler.Abstract implements Runnable {
 | 
			
		|||
            if (idx != -1) {
 | 
			
		||||
                String key = pair.substring(0, idx);
 | 
			
		||||
                if (key.trim().equalsIgnoreCase("query")) {
 | 
			
		||||
                    return pair.substring(idx + 1).trim().toUpperCase();
 | 
			
		||||
                    return pair.substring(idx + 1).trim().toLowerCase();
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -43,6 +43,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
			
		|||
			if (response.statusCode() != 200) {
 | 
			
		||||
				throw new IOException("Response status code " + response.statusCode());
 | 
			
		||||
			}
 | 
			
		||||
			LocalDateTime fetchedAt = LocalDateTime.now(ZoneOffset.UTC);
 | 
			
		||||
			ObjectMapper mapper = new ObjectMapper();
 | 
			
		||||
			try (var in = new GZIPInputStream(response.body())) {
 | 
			
		||||
				ArrayNode array = mapper.readValue(in, ArrayNode.class);
 | 
			
		||||
| 
						 | 
				
			
			@ -50,7 +51,7 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
			
		|||
				for (JsonNode node : array) {
 | 
			
		||||
					if (node.isObject()) {
 | 
			
		||||
						try {
 | 
			
		||||
							packages.add(parsePackage((ObjectNode) node));
 | 
			
		||||
							packages.add(parsePackage((ObjectNode) node, fetchedAt));
 | 
			
		||||
						} catch (Exception e) {
 | 
			
		||||
							e.printStackTrace();
 | 
			
		||||
						}
 | 
			
		||||
| 
						 | 
				
			
			@ -63,11 +64,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
			
		|||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	private PackageInfo parsePackage(ObjectNode obj) {
 | 
			
		||||
	private PackageInfo parsePackage(ObjectNode obj, LocalDateTime fetchedAt) {
 | 
			
		||||
		return new PackageInfo(
 | 
			
		||||
				obj.get("name").asText(),
 | 
			
		||||
				mapJsonArray(obj.withArray("categories"), JsonNode::asText).toArray(new String[0]),
 | 
			
		||||
				mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0])
 | 
			
		||||
				mapJsonArray(obj.withArray("versions"), this::parseVersion).toArray(new PackageInfo.VersionInfo[0]),
 | 
			
		||||
				obj.get("stats").get("downloads").get("total").asLong(),
 | 
			
		||||
				fetchedAt
 | 
			
		||||
		);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -97,6 +100,13 @@ public class DubRegistryPackageFetcher implements PackageFetcher {
 | 
			
		|||
		);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/**
 | 
			
		||||
	 * Maps a JSON array to a list of objects, using a mapping function.
 | 
			
		||||
	 * @param array The JSON array.
 | 
			
		||||
	 * @param mapper The mapper function to apply to each element of the array.
 | 
			
		||||
	 * @return The mapped list of objects.
 | 
			
		||||
	 * @param <T> The type of the resultant list elements.
 | 
			
		||||
	 */
 | 
			
		||||
	private static <T> List<T> mapJsonArray(ArrayNode array, Function<JsonNode, T> mapper) {
 | 
			
		||||
		List<T> list = new ArrayList<>(array.size());
 | 
			
		||||
		for (JsonNode node : array) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -4,18 +4,23 @@ import com.andrewlalis.d_package_search.PackageIndexer;
 | 
			
		|||
import com.andrewlalis.d_package_search.PackageInfo;
 | 
			
		||||
import org.apache.lucene.analysis.Analyzer;
 | 
			
		||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
 | 
			
		||||
import org.apache.lucene.document.Document;
 | 
			
		||||
import org.apache.lucene.document.Field;
 | 
			
		||||
import org.apache.lucene.document.StoredField;
 | 
			
		||||
import org.apache.lucene.document.TextField;
 | 
			
		||||
import org.apache.lucene.document.*;
 | 
			
		||||
import org.apache.lucene.index.IndexWriter;
 | 
			
		||||
import org.apache.lucene.index.IndexWriterConfig;
 | 
			
		||||
import org.apache.lucene.index.IndexableFieldType;
 | 
			
		||||
import org.apache.lucene.store.Directory;
 | 
			
		||||
import org.apache.lucene.store.FSDirectory;
 | 
			
		||||
 | 
			
		||||
import java.io.IOException;
 | 
			
		||||
import java.nio.file.Path;
 | 
			
		||||
import java.time.Duration;
 | 
			
		||||
import java.time.ZoneOffset;
 | 
			
		||||
import java.util.*;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * An indexer that produces a Lucene index, which is a directory, composed of
 | 
			
		||||
 * possibly many index segments.
 | 
			
		||||
 */
 | 
			
		||||
public class LucenePackageIndexer implements PackageIndexer {
 | 
			
		||||
	private final IndexWriter indexWriter;
 | 
			
		||||
	private final Directory dir;
 | 
			
		||||
| 
						 | 
				
			
			@ -30,13 +35,70 @@ public class LucenePackageIndexer implements PackageIndexer {
 | 
			
		|||
		this.indexWriter = new IndexWriter(dir, config);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/**
 | 
			
		||||
	 * Adds a package to the Lucene index. This is the central place where the
 | 
			
		||||
	 * index's fields are defined. We define the following fields:
 | 
			
		||||
	 * <ul>
 | 
			
		||||
	 *     <li>name (text, stored)</li>
 | 
			
		||||
	 *     <li>url (stored only)</li>
 | 
			
		||||
	 *     <li>categories (multivalued string field with value for each category).</li>
 | 
			
		||||
	 *     <li>latestVersionTimestamp (string field with date of latest version).</li>
 | 
			
		||||
	 *     <li>description (optional text field)</li>
 | 
			
		||||
	 *     <li>license (optional string field)</li>
 | 
			
		||||
	 *     <li>readme (optional text field)</li>
 | 
			
		||||
	 *     <li>
 | 
			
		||||
	 *         features (feature field with the following features useful for scoring)
 | 
			
		||||
	 *         <ul>
 | 
			
		||||
	 *             <li>recency (0 - 1 value indicating how recent the package is)</li>
 | 
			
		||||
	 *             <li>downloads (total downloads for the package)</li>
 | 
			
		||||
	 *         </ul>
 | 
			
		||||
	 *     </li>
 | 
			
		||||
	 * </ul>
 | 
			
		||||
	 * @param info The package to index.
 | 
			
		||||
	 * @throws IOException If an error occurs.
 | 
			
		||||
	 */
 | 
			
		||||
	@Override
 | 
			
		||||
	public void addToIndex(PackageInfo info) throws IOException {
 | 
			
		||||
		if (info.versions().length == 0) {
 | 
			
		||||
			System.out.println("Skipping package \"" + info.name() + "\" because there are no versions available.");
 | 
			
		||||
			return;
 | 
			
		||||
		}
 | 
			
		||||
		System.out.println("Indexing package \"" + info.name() + "\".");
 | 
			
		||||
		String dubUrl = "https://code.dlang.org/packages/" + info.name();
 | 
			
		||||
		List<PackageInfo.VersionInfo> allVersions = new ArrayList<>(Arrays.asList(info.versions()));
 | 
			
		||||
		allVersions.sort(Comparator.comparing(PackageInfo.VersionInfo::timestamp).reversed());
 | 
			
		||||
		var recentVersions = allVersions.subList(0, Math.min(5, allVersions.size()));
 | 
			
		||||
 | 
			
		||||
		Document doc = new Document();
 | 
			
		||||
		doc.add(new TextField("name", info.name(), Field.Store.YES));
 | 
			
		||||
		doc.add(new StoredField("url", dubUrl));
 | 
			
		||||
		for (String category : info.categories()) {
 | 
			
		||||
			doc.add(new StringField("categories", category, Field.Store.NO));
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		PackageInfo.VersionInfo latestVersion = recentVersions.getFirst();
 | 
			
		||||
		doc.add(new StringField(
 | 
			
		||||
				"latestVersionTimestamp",
 | 
			
		||||
				DateTools.dateToString(Date.from(latestVersion.timestamp().toInstant(ZoneOffset.UTC)), DateTools.Resolution.SECOND),
 | 
			
		||||
				Field.Store.NO
 | 
			
		||||
		));
 | 
			
		||||
		if (latestVersion.description() != null) {
 | 
			
		||||
			doc.add(new TextField("description", latestVersion.description(), Field.Store.NO));
 | 
			
		||||
		}
 | 
			
		||||
		if (latestVersion.license() != null) {
 | 
			
		||||
			doc.add(new StringField("license", latestVersion.license(), Field.Store.NO));
 | 
			
		||||
		}
 | 
			
		||||
		if (latestVersion.readmeText() != null) {
 | 
			
		||||
			doc.add(new TextField("readme", latestVersion.readmeText(), Field.Store.NO));
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		// Add FeatureFields to score packages based on some metrics.
 | 
			
		||||
		int daysSinceUpdate = Math.clamp(Duration.between(latestVersion.timestamp(), info.fetchedAt()).toDays(), 1, 365 * 3);
 | 
			
		||||
		float recency = 1f / daysSinceUpdate;
 | 
			
		||||
		float downloadsScore = Math.clamp(info.totalDownloads(), 0.001f, Float.MAX_VALUE);
 | 
			
		||||
		doc.add(new FeatureField("features", "recency", recency));
 | 
			
		||||
		doc.add(new FeatureField("features", "downloads", downloadsScore));
 | 
			
		||||
 | 
			
		||||
		indexWriter.addDocument(doc);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,6 +3,7 @@ package com.andrewlalis.d_package_search.impl;
 | 
			
		|||
import com.andrewlalis.d_package_search.PackageSearchResult;
 | 
			
		||||
import com.andrewlalis.d_package_search.PackageSearcher;
 | 
			
		||||
import org.apache.lucene.document.Document;
 | 
			
		||||
import org.apache.lucene.document.FeatureField;
 | 
			
		||||
import org.apache.lucene.index.DirectoryReader;
 | 
			
		||||
import org.apache.lucene.index.Term;
 | 
			
		||||
import org.apache.lucene.search.*;
 | 
			
		||||
| 
						 | 
				
			
			@ -11,12 +12,13 @@ import org.apache.lucene.store.FSDirectory;
 | 
			
		|||
import java.io.IOException;
 | 
			
		||||
import java.nio.file.Files;
 | 
			
		||||
import java.nio.file.Path;
 | 
			
		||||
import java.util.ArrayList;
 | 
			
		||||
import java.util.Collections;
 | 
			
		||||
import java.util.List;
 | 
			
		||||
import java.util.SequencedCollection;
 | 
			
		||||
import java.util.*;
 | 
			
		||||
import java.util.concurrent.Executors;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * A package searcher implementation that uses a weighted wildcard query to
 | 
			
		||||
 * search a Lucene index.
 | 
			
		||||
 */
 | 
			
		||||
public class LucenePackageSearcher implements PackageSearcher {
 | 
			
		||||
    private final Path indexPath;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -52,12 +54,28 @@ public class LucenePackageSearcher implements PackageSearcher {
 | 
			
		|||
    private Query buildQuery(String queryText) {
 | 
			
		||||
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
 | 
			
		||||
        String[] searchTerms = queryText.toLowerCase().split("\\s+");
 | 
			
		||||
        for (String searchTerm : searchTerms) {
 | 
			
		||||
            String wildcardTerm = searchTerm + "*";
 | 
			
		||||
            Query basicQuery = new WildcardQuery(new Term("name", wildcardTerm));
 | 
			
		||||
            queryBuilder.add(new BoostQuery(basicQuery, 1f), BooleanClause.Occur.SHOULD);
 | 
			
		||||
 | 
			
		||||
        Map<String, Float> weightedFields = Map.of(
 | 
			
		||||
                "name", 1f,
 | 
			
		||||
                "description", 0.5f,
 | 
			
		||||
                "readme", 0.25f
 | 
			
		||||
        );
 | 
			
		||||
 | 
			
		||||
        for (int i = 0; i < Math.min(5, searchTerms.length); i++) {
 | 
			
		||||
            for (var entry : weightedFields.entrySet()) {
 | 
			
		||||
                String fieldName = entry.getKey();
 | 
			
		||||
                float fieldWeight = entry.getValue();
 | 
			
		||||
                Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight);
 | 
			
		||||
                queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        return queryBuilder.build();
 | 
			
		||||
        Query baseQuery = queryBuilder.build();
 | 
			
		||||
        Query boostedQuery = new BooleanQuery.Builder()
 | 
			
		||||
                .add(baseQuery, BooleanClause.Occur.MUST)
 | 
			
		||||
                .add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD)
 | 
			
		||||
                .add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD)
 | 
			
		||||
                .build();
 | 
			
		||||
        return boostedQuery;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    private PackageSearchResult prepareResult(Document doc) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue