feat: index ngrams to allow partial search

This commit is contained in:
Gauthier Roebroeck 2021-09-15 15:12:00 +08:00
parent 817c2939b0
commit 6e0c51ed1d
8 changed files with 146 additions and 21 deletions

View File

@@ -55,5 +55,17 @@ class KomgaProperties {
// Lucene full-text-search settings, bound from application configuration.
class Lucene {
// Filesystem location of the Lucene index; validation rejects a blank value.
@get:NotBlank
var dataDirectory: String = ""
// Settings for the index-time analyzer (n-gram based partial matching);
// consumed when constructing MultiLingualNGramAnalyzer.
var indexAnalyzer = IndexAnalyzer()
// N-gram bounds for index-time tokenization; both must be strictly positive.
class IndexAnalyzer {
@get:Positive
var minGram: Int = 3 // shortest n-gram emitted per token
@get:Positive
var maxGram: Int = 10 // longest n-gram emitted per token
var preserveOriginal: Boolean = true // also index tokens whose length falls outside [minGram, maxGram]
}
}
}

View File

@@ -16,7 +16,13 @@ class LuceneConfiguration(
) {
@Bean
fun analyzer() =
fun indexAnalyzer() =
with(komgaProperties.lucene.indexAnalyzer) {
MultiLingualNGramAnalyzer(minGram, maxGram, preserveOriginal).apply { version = Version.LUCENE_8_9_0 }
}
@Bean
fun searchAnalyzer() =
MultiLingualAnalyzer().apply { version = Version.LUCENE_8_9_0 }
@Bean

View File

@@ -21,10 +21,11 @@ private val logger = KotlinLogging.logger {}
@Component
class LuceneHelper(
private val directory: Directory,
private val analyzer: Analyzer,
private val indexAnalyzer: Analyzer,
private val searchAnalyzer: Analyzer,
) {
fun getIndexWriter() = IndexWriter(directory, IndexWriterConfig(analyzer))
fun getIndexWriter() = IndexWriter(directory, IndexWriterConfig(indexAnalyzer))
fun getIndexReader(): DirectoryReader = DirectoryReader.open(directory)
@@ -33,7 +34,7 @@ class LuceneHelper(
fun searchEntitiesIds(searchTerm: String?, entity: LuceneEntity): List<String>? {
return if (!searchTerm.isNullOrBlank()) {
try {
val fieldsQuery = MultiFieldQueryParser(entity.defaultFields, analyzer).apply {
val fieldsQuery = MultiFieldQueryParser(entity.defaultFields, searchAnalyzer).apply {
defaultOperator = QueryParser.Operator.AND
}.parse(searchTerm)

View File

@@ -9,7 +9,7 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
import org.apache.lucene.analysis.standard.StandardTokenizer
class MultiLingualAnalyzer : Analyzer() {
open class MultiLingualAnalyzer : Analyzer() {
override fun createComponents(fieldName: String): TokenStreamComponents {
val source: Tokenizer = StandardTokenizer()
// run the widthfilter first before bigramming, it sometimes combines characters.

View File

@@ -0,0 +1,23 @@
package org.gotson.komga.infrastructure.search
import org.apache.lucene.analysis.LowerCaseFilter
import org.apache.lucene.analysis.TokenStream
import org.apache.lucene.analysis.Tokenizer
import org.apache.lucene.analysis.cjk.CJKBigramFilter
import org.apache.lucene.analysis.cjk.CJKWidthFilter
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
import org.apache.lucene.analysis.ngram.NGramTokenFilter
import org.apache.lucene.analysis.standard.StandardTokenizer
/**
 * Index-time analyzer: runs the multilingual pipeline (width normalization,
 * lower-casing, CJK bigramming) and then expands each token into n-grams of
 * [minGram]..[maxGram] characters so partial terms can match at search time.
 * With [preserveOriginal] set, tokens whose length falls outside the gram
 * range are kept as-is instead of being dropped.
 */
class MultiLingualNGramAnalyzer(val minGram: Int, val maxGram: Int, val preserveOriginal: Boolean) : MultiLingualAnalyzer() {
    override fun createComponents(fieldName: String): TokenStreamComponents {
        val tokenizer: Tokenizer = StandardTokenizer()
        // Width folding must happen before bigramming: it can merge half-width characters.
        val normalized = CJKWidthFilter(tokenizer)
        val lowercased = LowerCaseFilter(normalized)
        val bigrammed = CJKBigramFilter(lowercased)
        val ngrammed = NGramTokenFilter(bigrammed, minGram, maxGram, preserveOriginal)
        val folded: TokenStream = ASCIIFoldingFilter(ngrammed)
        return TokenStreamComponents(tokenizer, folded)
    }
}

View File

@@ -1,6 +1,5 @@
package org.gotson.komga.infrastructure.search
import org.apache.lucene.analysis.Analyzer
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
@@ -8,21 +7,6 @@ class MultilingualAnalyzerTest {
private val analyzer = MultiLingualAnalyzer()
// Collects every term this analyzer emits for [text], in emission order.
// Term values are read via attribute reflection (the "term" attribute).
private fun Analyzer.getTokens(text: String): List<String> = mutableListOf<String>().also { collected ->
    tokenStream("text", text).use { ts ->
        ts.reset()
        while (ts.incrementToken()) {
            ts.reflectWith { _, key, value -> if (key == "term") collected += value.toString() }
        }
        ts.end()
    }
}
@Test
fun `english text`() {
// given

View File

@@ -0,0 +1,81 @@
package org.gotson.komga.infrastructure.search
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
/**
 * Exercises [MultiLingualNGramAnalyzer] token output for a single character
 * and for Chinese, Japanese (hiragana/katakana) and Korean text, covering
 * both settings of the preserve-original option.
 */
class MultilingualNGramAnalyzerTest {

  // Every test uses the same gram bounds (3..8); only preserveOriginal varies.
  private fun analyzerWith(preserveOriginal: Boolean) = MultiLingualNGramAnalyzer(3, 8, preserveOriginal)

  @Test
  fun `single letter`() {
    val input = "J"

    val withoutOriginal = analyzerWith(false).getTokens(input)
    val withOriginal = analyzerWith(true).getTokens(input)

    // A lone character is shorter than minGram, so it survives only when preserved.
    assertThat(withOriginal).containsExactly("j")
    assertThat(withoutOriginal).isEmpty()
  }

  @Test
  fun `chinese mixed`() {
    val actual = analyzerWith(true).getTokens("[不道德公會][河添太一 ][東立]Vol.04-搬运")

    assertThat(actual).containsExactly("不道", "道德", "德公", "公會", "河添", "添太", "太一", "東立", "vol", "04", "搬运")
  }

  @Test
  fun `chinese only`() {
    val actual = analyzerWith(true).getTokens("不道德公會河添太一東立搬运")

    assertThat(actual).containsExactly("不道", "道德", "德公", "公會", "會河", "河添", "添太", "太一", "一東", "東立", "立搬", "搬运")
  }

  @Test
  fun `hiragana only`() {
    val actual = analyzerWith(true).getTokens("探偵はもう、死んでいる。")

    assertThat(actual).containsExactly("探偵", "偵は", "はも", "もう", "死ん", "んで", "でい", "いる")
  }

  @Test
  fun `katakana only`() {
    val actual = analyzerWith(true).getTokens("ワンパンマン")

    assertThat(actual).containsExactly("ワン", "ンパ", "パン", "ンマ", "マン")
  }

  @Test
  fun `korean only`() {
    val actual = analyzerWith(true).getTokens("고교생을 환불해 주세요")

    assertThat(actual).containsExactly("고교", "교생", "생을", "환불", "불해", "주세", "세요")
  }
}

View File

@@ -0,0 +1,18 @@
package org.gotson.komga.infrastructure.search
import org.apache.lucene.analysis.Analyzer
/**
 * Runs [text] through this analyzer and returns the emitted terms in order.
 * Term values are captured via attribute reflection (the "term" attribute);
 * the token stream is reset, consumed, ended and closed per the Lucene contract.
 */
fun Analyzer.getTokens(text: String): List<String> {
    val terms = mutableListOf<String>()
    tokenStream("text", text).use { stream ->
        stream.reset()
        while (stream.incrementToken()) {
            stream.reflectWith { _, name, value -> if (name == "term") terms.add(value.toString()) }
        }
        stream.end()
    }
    return terms
}