mirror of
https://github.com/gotson/komga.git
synced 2025-01-09 04:08:00 +08:00
feat: index ngrams to allow partial search
This commit is contained in:
parent
817c2939b0
commit
6e0c51ed1d
@ -55,5 +55,17 @@ class KomgaProperties {
|
||||
/**
 * Configuration properties for the embedded Lucene search index.
 */
class Lucene {
  // Filesystem location where the Lucene index is persisted. Must not be blank.
  @get:NotBlank
  var dataDirectory: String = ""

  // Settings for the analyzer used at index time (n-gram indexing for partial search).
  var indexAnalyzer = IndexAnalyzer()

  /**
   * N-gram bounds for the index-time analyzer.
   */
  class IndexAnalyzer {
    // Minimum n-gram length; tokens shorter than this produce no grams
    // (they only survive when preserveOriginal is true).
    @get:Positive
    var minGram: Int = 3

    // Maximum n-gram length. Presumably expected to be >= minGram — no
    // cross-field validation is visible here; TODO confirm.
    @get:Positive
    var maxGram: Int = 10

    // When true, the original token is emitted alongside its n-grams.
    var preserveOriginal: Boolean = true
  }
}
|
||||
}
|
||||
|
@ -16,7 +16,13 @@ class LuceneConfiguration(
|
||||
) {
|
||||
|
||||
@Bean
|
||||
fun analyzer() =
|
||||
fun indexAnalyzer() =
|
||||
with(komgaProperties.lucene.indexAnalyzer) {
|
||||
MultiLingualNGramAnalyzer(minGram, maxGram, preserveOriginal).apply { version = Version.LUCENE_8_9_0 }
|
||||
}
|
||||
|
||||
@Bean
|
||||
fun searchAnalyzer() =
|
||||
MultiLingualAnalyzer().apply { version = Version.LUCENE_8_9_0 }
|
||||
|
||||
@Bean
|
||||
|
@ -21,10 +21,11 @@ private val logger = KotlinLogging.logger {}
|
||||
@Component
|
||||
class LuceneHelper(
|
||||
private val directory: Directory,
|
||||
private val analyzer: Analyzer,
|
||||
private val indexAnalyzer: Analyzer,
|
||||
private val searchAnalyzer: Analyzer,
|
||||
) {
|
||||
|
||||
fun getIndexWriter() = IndexWriter(directory, IndexWriterConfig(analyzer))
|
||||
fun getIndexWriter() = IndexWriter(directory, IndexWriterConfig(indexAnalyzer))
|
||||
|
||||
fun getIndexReader(): DirectoryReader = DirectoryReader.open(directory)
|
||||
|
||||
@ -33,7 +34,7 @@ class LuceneHelper(
|
||||
fun searchEntitiesIds(searchTerm: String?, entity: LuceneEntity): List<String>? {
|
||||
return if (!searchTerm.isNullOrBlank()) {
|
||||
try {
|
||||
val fieldsQuery = MultiFieldQueryParser(entity.defaultFields, analyzer).apply {
|
||||
val fieldsQuery = MultiFieldQueryParser(entity.defaultFields, searchAnalyzer).apply {
|
||||
defaultOperator = QueryParser.Operator.AND
|
||||
}.parse(searchTerm)
|
||||
|
||||
|
@ -9,7 +9,7 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer
|
||||
|
||||
class MultiLingualAnalyzer : Analyzer() {
|
||||
open class MultiLingualAnalyzer : Analyzer() {
|
||||
override fun createComponents(fieldName: String): TokenStreamComponents {
|
||||
val source: Tokenizer = StandardTokenizer()
|
||||
// run the widthfilter first before bigramming, it sometimes combines characters.
|
||||
|
@ -0,0 +1,23 @@
|
||||
package org.gotson.komga.infrastructure.search
|
||||
|
||||
import org.apache.lucene.analysis.LowerCaseFilter
|
||||
import org.apache.lucene.analysis.TokenStream
|
||||
import org.apache.lucene.analysis.Tokenizer
|
||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthFilter
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer
|
||||
|
||||
/**
 * [MultiLingualAnalyzer] variant that additionally emits n-grams of each token,
 * allowing partial-term matching at search time.
 *
 * @param minGram shortest gram emitted
 * @param maxGram longest gram emitted
 * @param preserveOriginal when true, the original token is kept alongside its grams
 */
class MultiLingualNGramAnalyzer(val minGram: Int, val maxGram: Int, val preserveOriginal: Boolean) : MultiLingualAnalyzer() {
  override fun createComponents(fieldName: String): TokenStreamComponents {
    val tokenizer: Tokenizer = StandardTokenizer()
    // Filter order matters: the width filter must run before bigramming, as it
    // sometimes combines characters; n-grams are produced before ASCII folding.
    val stream: TokenStream = ASCIIFoldingFilter(
      NGramTokenFilter(
        CJKBigramFilter(LowerCaseFilter(CJKWidthFilter(tokenizer))),
        minGram,
        maxGram,
        preserveOriginal,
      ),
    )
    return TokenStreamComponents(tokenizer, stream)
  }
}
|
@ -1,6 +1,5 @@
|
||||
package org.gotson.komga.infrastructure.search
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer
|
||||
import org.assertj.core.api.Assertions.assertThat
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
@ -8,21 +7,6 @@ class MultilingualAnalyzerTest {
|
||||
|
||||
private val analyzer = MultiLingualAnalyzer()
|
||||
|
||||
private fun Analyzer.getTokens(text: String): List<String> {
|
||||
val tokenStream = tokenStream("text", text)
|
||||
|
||||
val tokens = mutableListOf<String>()
|
||||
tokenStream.use { ts ->
|
||||
ts.reset()
|
||||
while (ts.incrementToken()) {
|
||||
ts.reflectWith { _, key, value -> if (key == "term") tokens += value.toString() }
|
||||
}
|
||||
ts.end()
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `english text`() {
|
||||
// given
|
||||
|
@ -0,0 +1,81 @@
|
||||
package org.gotson.komga.infrastructure.search
|
||||
|
||||
import org.assertj.core.api.Assertions.assertThat
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class MultilingualNGramAnalyzerTest {

  // Every case uses the same gram bounds; only preserveOriginal varies.
  private fun analyzer(preserveOriginal: Boolean) = MultiLingualNGramAnalyzer(3, 8, preserveOriginal)

  @Test
  fun `single letter`() {
    val input = "J"

    val withoutOriginal = analyzer(false).getTokens(input)
    val withOriginal = analyzer(true).getTokens(input)

    // A single character is below minGram: it only survives when the original token is preserved.
    assertThat(withOriginal).containsExactly("j")
    assertThat(withoutOriginal).isEmpty()
  }

  @Test
  fun `chinese mixed`() {
    val input = "[不道德公會][河添太一 ][東立]Vol.04-搬运"

    val tokens = analyzer(true).getTokens(input)

    // CJK runs are bigrammed; latin/digit runs are kept whole (within gram bounds).
    assertThat(tokens).containsExactly("不道", "道德", "德公", "公會", "河添", "添太", "太一", "東立", "vol", "04", "搬运")
  }

  @Test
  fun `chinese only`() {
    val input = "不道德公會河添太一東立搬运"

    val tokens = analyzer(true).getTokens(input)

    // One unbroken run: overlapping bigrams across the whole string.
    assertThat(tokens).containsExactly("不道", "道德", "德公", "公會", "會河", "河添", "添太", "太一", "一東", "東立", "立搬", "搬运")
  }

  @Test
  fun `hiragana only`() {
    val input = "探偵はもう、死んでいる。"

    val tokens = analyzer(true).getTokens(input)

    // Punctuation splits the text into runs, each bigrammed separately.
    assertThat(tokens).containsExactly("探偵", "偵は", "はも", "もう", "死ん", "んで", "でい", "いる")
  }

  @Test
  fun `katakana only`() {
    val input = "ワンパンマン"

    val tokens = analyzer(true).getTokens(input)

    assertThat(tokens).containsExactly("ワン", "ンパ", "パン", "ンマ", "マン")
  }

  @Test
  fun `korean only`() {
    val input = "고교생을 환불해 주세요"

    val tokens = analyzer(true).getTokens(input)

    // Whitespace-separated words are bigrammed independently.
    assertThat(tokens).containsExactly("고교", "교생", "생을", "환불", "불해", "주세", "세요")
  }
}
|
@ -0,0 +1,18 @@
|
||||
package org.gotson.komga.infrastructure.search
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer
|
||||
|
||||
/**
 * Runs [text] through this analyzer and collects the emitted term attributes.
 * Drives the full TokenStream lifecycle (reset / incrementToken / end / close).
 */
fun Analyzer.getTokens(text: String): List<String> {
  val collected = mutableListOf<String>()
  tokenStream("text", text).use { stream ->
    stream.reset()
    while (stream.incrementToken()) {
      // reflectWith exposes each attribute as key/value pairs; keep only the term text.
      stream.reflectWith { _, key, value -> if (key == "term") collected += value.toString() }
    }
    stream.end()
  }
  return collected
}
|
Loading…
Reference in New Issue
Block a user