From db4774938a2fa891b0ac7f23ef278de855912af2 Mon Sep 17 00:00:00 2001 From: andrewlalis Date: Tue, 10 Oct 2023 13:51:54 -0400 Subject: [PATCH] Improved scoring, added more documentation to LucenePackageSearcher, added explanations to results. --- .../d_package_search/PackageSearchResult.java | 3 +- .../impl/LucenePackageSearcher.java | 59 +++++++++++++++---- src/main/resources/index.html | 18 ++++-- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/src/main/java/com/andrewlalis/d_package_search/PackageSearchResult.java b/src/main/java/com/andrewlalis/d_package_search/PackageSearchResult.java index 518308c..a1e4890 100644 --- a/src/main/java/com/andrewlalis/d_package_search/PackageSearchResult.java +++ b/src/main/java/com/andrewlalis/d_package_search/PackageSearchResult.java @@ -2,5 +2,6 @@ package com.andrewlalis.d_package_search; public record PackageSearchResult( String name, - String url + String url, + String explanation ) {} diff --git a/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java b/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java index 43e576d..d15f068 100644 --- a/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java +++ b/src/main/java/com/andrewlalis/d_package_search/impl/LucenePackageSearcher.java @@ -20,6 +20,22 @@ import java.util.concurrent.Executors; * search a Lucene index. */ public class LucenePackageSearcher implements PackageSearcher { + /** + * Factor by which we prefer results containing the entire search phrase + * instead of just a part of it. + */ + private static final float PHRASE_WEIGHT_MODIFIER = 2f; + + /** + * A mapping of indexed fields, and the weight they contribute to a result's + * score, if the result contains a match for the field. + */ + private static final Map WEIGHTED_FIELDS = Map.of( + "name", 1f, + "description", 0.5f, + "readme", 0.25f + ); + private final Path indexPath; public LucenePackageSearcher(Path indexPath) { @@ -37,7 +53,11 @@ public class LucenePackageSearcher implements PackageSearcher { List results = new ArrayList<>(25); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document doc = searcher.storedFields().document(scoreDoc.doc); - results.add(prepareResult(doc)); + results.add(prepareResult( + doc, + "Search result scoring explanation:\n" + + searcher.explain(luceneQuery, scoreDoc.doc).toString() + )); } return results; } catch (IOException e) { @@ -55,35 +75,48 @@ public class LucenePackageSearcher implements PackageSearcher { BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); String[] searchTerms = queryText.toLowerCase().split("\\s+"); - // We define a set of weighted fields that we will add prefix queries for. - Map weightedFields = Map.of( - "name", 1f, - "description", 0.5f, - "readme", 0.25f - ); - // Only consider the first 5 search terms, and add a prefix query for each term for them. for (int i = 0; i < Math.min(5, searchTerms.length); i++) { - for (var entry : weightedFields.entrySet()) { + for (var entry : WEIGHTED_FIELDS.entrySet()) { String fieldName = entry.getKey(); float fieldWeight = entry.getValue(); Query termQuery = new BoostQuery(new PrefixQuery(new Term(fieldName, searchTerms[i])), fieldWeight); queryBuilder.add(termQuery, BooleanClause.Occur.SHOULD); } } + + /* + If there's more than one word in the search query, put an extra emphasis + on finding a match with the entire query together. We use the PhraseQuery + builder to build an ordered phrase query for each of the weighted fields. + */ + if (searchTerms.length > 1) { + for (var entry : WEIGHTED_FIELDS.entrySet()) { + String fieldName = entry.getKey(); + float fieldWeight = entry.getValue(); + PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder(); + for (int i = 0; i < searchTerms.length; i++) { + phraseQueryBuilder.add(new Term(fieldName, searchTerms[i]), i); + } + queryBuilder.add(new BoostQuery(phraseQueryBuilder.build(), fieldWeight * PHRASE_WEIGHT_MODIFIER), BooleanClause.Occur.SHOULD); + } + } + Query baseQuery = queryBuilder.build(); + System.out.println("Query: " + baseQuery.toString()); Query boostedQuery = new BooleanQuery.Builder() .add(baseQuery, BooleanClause.Occur.MUST) - .add(FeatureField.newSaturationQuery("features", "recency"), BooleanClause.Occur.SHOULD) - .add(FeatureField.newSaturationQuery("features", "downloads"), BooleanClause.Occur.SHOULD) + .add(FeatureField.newSaturationQuery("features", "recency", 0.25f, 1f/30f), BooleanClause.Occur.SHOULD) + .add(FeatureField.newSaturationQuery("features", "downloads", 0.5f, 500f), BooleanClause.Occur.SHOULD) .build(); return boostedQuery; } - private PackageSearchResult prepareResult(Document doc) { + private PackageSearchResult prepareResult(Document doc, String explanation) { return new PackageSearchResult( doc.get("name"), - doc.get("url") + doc.get("url"), + explanation ); } } diff --git a/src/main/resources/index.html b/src/main/resources/index.html index f3f7c46..de96d84 100644 --- a/src/main/resources/index.html +++ b/src/main/resources/index.html @@ -9,7 +9,7 @@

D Package Search

- Use this site to search for D packages! + Use this site to search for D packages. It's currently a proof-of-concept for using Lucene to index and search for D packages, since it offers a lot of out-of-the-box features for things like full-text indexing, scoring based on numerical features, and explaining exactly how results were found.

@@ -27,26 +27,36 @@ container.innerHTML = ""; for (let i = 0; i < results.length; i++) { const element = document.createElement("div"); + element.style = "border: 1px solid black; margin: 5px; padding: 5px;"; const header = document.createElement("h3"); header.innerText = results[i].name; element.appendChild(header); const link = document.createElement("a"); link.href = results[i].url; - link.innerText = "Link to package"; + link.innerText = results[i].url; element.appendChild(link); + const explanation = document.createElement("pre"); + explanation.innerText = results[i].explanation; + element.appendChild(explanation); container.appendChild(element); } } const searchInput = document.getElementById("search-input"); + let searchTimeoutId = null; searchInput.addEventListener("keyup", async () => { const query = searchInput.value; + if (searchTimeoutId) { + window.clearTimeout(searchTimeoutId); + } if (query.length < 1) { showResults([]); return; } - const results = await fetchResults(query); - showResults(results); + searchTimeoutId = window.setTimeout(async () => { + const results = await fetchResults(query); + showResults(results); + }, 1000); })