Added implementation.

This commit is contained in:
Andrew Lalis 2023-02-06 22:51:16 +01:00
parent 1ca30f1c06
commit c91c116d4e
6 changed files with 74889 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
target/
.idea/
airports-index/
*.iml

74622
airports.csv Normal file

File diff suppressed because it is too large Load Diff

31
pom.xml Normal file
View File

@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>io.github.andrewlalis</groupId>
<artifactId>SampleLuceneSearch</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>9.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.10.0</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,24 @@
package io.github.andrewlalis.sample_lucene_search;
import java.util.Optional;
public record Airport(
long id,
String ident,
String type,
String name,
double latitude,
double longitude,
Optional<Integer> elevationFt,
String continent,
String isoCountry,
String isoRegion,
String municipality,
boolean scheduledService,
Optional<String> gpsCode,
Optional<String> iataCode,
Optional<String> localCode,
Optional<String> homeLink,
Optional<String> wikipediaLink,
Optional<String> keywords
) {}

View File

@ -0,0 +1,66 @@
package io.github.andrewlalis.sample_lucene_search;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
public final class AirportParser {
private AirportParser() {}
public static List<Airport> parseAirports(Path filePath) {
CSVFormat format = CSVFormat.DEFAULT.builder()
.setHeader()
.setSkipHeaderRecord(true)
.build();
try (
var reader = Files.newBufferedReader(filePath);
var parser = format.parse(reader)
) {
var it = parser.iterator();
List<Airport> airports = new ArrayList<>();
while (it.hasNext()) {
airports.add(parseAirport(it.next()));
}
return airports;
} catch (IOException e) {
System.err.println("Error reading airports.");
e.printStackTrace();
return new ArrayList<>();
}
}
private static Airport parseAirport(CSVRecord r) {
return new Airport(
Long.parseLong(r.get("id")),
r.get("ident"),
r.get("type"),
r.get("name"),
Double.parseDouble(r.get("latitude_deg")),
Double.parseDouble(r.get("longitude_deg")),
getOptionalString(r, "elevation_ft").map(Integer::parseInt),
r.get("continent"),
r.get("iso_country"),
r.get("iso_region"),
r.get("municipality"),
r.get("scheduled_service").equalsIgnoreCase("yes"),
getOptionalString(r, "gps_code"),
getOptionalString(r, "iata_code"),
getOptionalString(r, "local_code"),
getOptionalString(r, "home_link"),
getOptionalString(r, "wikipedia_link"),
getOptionalString(r, "keywords")
);
}
private static Optional<String> getOptionalString(CSVRecord r, String key) {
String value = r.get(key);
if (value.isBlank()) value = null;
return Optional.ofNullable(value);
}
}

View File

@ -0,0 +1,142 @@
package io.github.andrewlalis.sample_lucene_search;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class SampleSearch {
public static void main(String[] args) throws IOException {
List<Airport> airports = AirportParser.parseAirports(Path.of("airports.csv"));
System.out.println("Read " + airports.size() + " airports.");
buildIndex(airports);
System.out.println("Built index.");
System.out.println("Entering search-cli mode. Type a query.");
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String line;
while ((line = reader.readLine()) != null) {
String rawQuery = line.strip().toLowerCase();
if (rawQuery.equals("exit")) break;
var results = searchAirports(rawQuery);
int i = 1;
for (var name : results) {
System.out.println(" " + i++ + ". " + name);
}
}
System.out.println("Done!");
}
public static void buildIndex(List<Airport> airports) throws IOException {
Path indexDir = Path.of("airports-index");
deleteDirRecursive(indexDir);
Files.createDirectories(indexDir);
try (
Analyzer analyzer = new StandardAnalyzer();
Directory luceneDir = FSDirectory.open(indexDir);
IndexWriter indexWriter = new IndexWriter(luceneDir, new IndexWriterConfig(analyzer))
) {
for (var airport : airports) {
Document doc = new Document();
doc.add(new StoredField("id", airport.id()));
doc.add(new TextField("ident", airport.ident(), Field.Store.YES));
doc.add(new TextField("type", airport.type(), Field.Store.YES));
doc.add(new TextField("name", airport.name(), Field.Store.YES));
doc.add(new TextField("continent", airport.continent(), Field.Store.YES));
doc.add(new TextField("isoCountry", airport.isoCountry(), Field.Store.YES));
doc.add(new TextField("municipality", airport.municipality(), Field.Store.YES));
doc.add(new IntPoint("elevationFt", airport.elevationFt().orElse(0)));
doc.add(new StoredField("elevationFt", airport.elevationFt().orElse(0)));
if (airport.wikipediaLink().isPresent()) {
doc.add(new StoredField("wikipediaLink", airport.wikipediaLink().get()));
}
indexWriter.addDocument(doc);
}
}
}
public static List<String> searchAirports(String rawQuery) {
Path indexDir = Path.of("airports-index");
// If the query is empty or there's no index, quit right away.
if (rawQuery == null || rawQuery.isBlank() || Files.notExists(indexDir)) return new ArrayList<>();
// Prepare a weight for each of the fields we want to search on.
Map<String, Float> fieldWeights = Map.of(
"name", 3f,
"municipality", 2f,
"ident", 2f,
"type", 1f,
"continent", 0.25f
);
// Build a boolean query made up of "boosted" wildcard term queries, that'll match any term.
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
String[] terms = rawQuery.toLowerCase().split("\\s+");
for (String term : terms) {
String wildcardTerm = term + "*";
for (var entry : fieldWeights.entrySet()) {
String fieldName = entry.getKey();
float weight = entry.getValue();
Query baseQuery = new WildcardQuery(new Term(fieldName, wildcardTerm));
queryBuilder.add(new BoostQuery(baseQuery, weight), BooleanClause.Occur.SHOULD);
}
}
Query query = queryBuilder.build();
// Use the query we built to fetch up to 10 results.
try (var reader = DirectoryReader.open(FSDirectory.open(indexDir))) {
IndexSearcher searcher = new IndexSearcher(reader);
List<String> results = new ArrayList<>(10);
TopDocs topDocs = searcher.search(query, 10, Sort.RELEVANCE, false);
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document doc = searcher.storedFields().document(scoreDoc.doc);
results.add(doc.get("name"));
}
return results;
} catch (IOException e) {
System.err.println("Failed to search index.");
e.printStackTrace();
return new ArrayList<>();
}
}
/**
* Helper function that removes a directory and its contents recursively.
* @param dir The directory to remove.
* @throws IOException If an error occurs.
*/
private static void deleteDirRecursive(Path dir) throws IOException {
if (Files.notExists(dir)) return;
Files.walkFileTree(dir, new SimpleFileVisitor<>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
Files.delete(file);
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
Files.delete(dir);
return FileVisitResult.CONTINUE;
}
});
}
}