feat(app): Misc add wordninja

Leon406 · May 30, 2024 · ed84b00 · ed84b00
1 parent 438cdc9
commit ed84b00
Show file tree

Hide file tree

Showing 9 changed files with 126,257 additions and 4 deletions.
diff --git a/app/src/main/kotlin/me/leon/ToolsApp.kt b/app/src/main/kotlin/me/leon/ToolsApp.kt
@@ -108,6 +108,7 @@ class ToolsApp : App(Home::class, Styles::class) {
                     }
                 }
                 println(dict)
+                copyResourceFileIfNotExist("/wordninja_words.txt", WORDNINJA_DICT_FILE)
                 // Load the class ahead of time so that its init runs
                 Translator.init()
             }

diff --git a/app/src/main/kotlin/me/leon/config/Config.kt b/app/src/main/kotlin/me/leon/config/Config.kt
@@ -21,3 +21,4 @@ const val WIKI = "https://github.com/Leon406/ToolsFx/wiki/Home"
 val APP_ROOT: String = File("").absolutePath
 val DICT_DIR: String = "$APP_ROOT/dict"
 val VOCABULARY_DIR: String = "$APP_ROOT/vocabulary"
+/**
+ * Word list loaded by `me.leon.misc.Spliter`; stored as `wordninja_words.txt` inside
+ * [VOCABULARY_DIR].
+ */
+val WORDNINJA_DICT_FILE = File(VOCABULARY_DIR, "wordninja_words.txt")
diff --git a/app/src/main/kotlin/me/leon/misc/MiscConfig.kt b/app/src/main/kotlin/me/leon/misc/MiscConfig.kt
@@ -57,6 +57,7 @@ val MISC_CONFIG =
             ),
         MiscServiceType.VARIABLE_NAMING to
             mapOf(HINT to "variable naming convert, separate by line"),
+        MiscServiceType.WORD_SPLITER to mapOf(HINT to "split no space letters, eg. whoseyourdaddy"),
     )
 
 val MISC_OPTIONS_CONFIG =

diff --git a/app/src/main/kotlin/me/leon/misc/MiscServiceType.kt b/app/src/main/kotlin/me/leon/misc/MiscServiceType.kt
@@ -224,6 +224,14 @@ enum class MiscServiceType(val type: String) : MiscService {
             }
         }
     },
+    WORD_SPLITER("word ninja") {
+        /**
+         * Runs [Spliter.splitContiguousWords] on each line of [raw] and joins that line's
+         * pieces with a single space. If splitting a line throws, the failure's
+         * `stacktrace()` text is used for that line instead.
+         */
+        override fun process(raw: String, params: Map<String, String>): String =
+            raw.lineAction2String { line ->
+                runCatching { Spliter.splitContiguousWords(line).joinToString(" ") }
+                    .getOrElse { failure -> failure.stacktrace() }
+            }
+    },
     ;
 
     override fun hint(): String {

diff --git a/app/src/main/kotlin/me/leon/misc/Spliter.kt b/app/src/main/kotlin/me/leon/misc/Spliter.kt
@@ -0,0 +1,104 @@
+package me.leon.misc
+
+import kotlin.math.ln
+import kotlin.math.max
+import me.leon.config.WORDNINJA_DICT_FILE
+
+/**
+ * Splits text that was written without spaces into its most probable sequence of words.
+ *
+ * Ported from
+ * https://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words/11642687
+ *
+ * The word list is read from [WORDNINJA_DICT_FILE] when this object is initialised. A word's
+ * cost grows with its line number in that file, so words listed earlier are preferred.
+ */
+object Spliter {
+    /**
+     * Cost charged for consuming one character that has no dictionary entry of its own. It
+     * is far larger than any dictionary cost, so it is only chosen when nothing else fits,
+     * and it is finite so that the text after such a character is still segmented normally.
+     */
+    private const val UNKNOWN_CHAR_COST = 1e10
+
+    private val splitRegex = "[^a-zA-Z0-9']+".toRegex()
+    private val wordCost: Map<String, Double>
+    private val maxWordLength: Int
+
+    init {
+        val dictionaryWords: List<String> = WORDNINJA_DICT_FILE.readLines()
+        // Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
+        val lgDictSize = ln(dictionaryWords.size.toDouble())
+        val costs = mutableMapOf<String, Double>()
+        dictionaryWords.forEachIndexed { position, word ->
+            costs[word] = ln((position + 1) * lgDictSize)
+        }
+        wordCost = costs
+        // An empty dictionary must not crash initialisation; every character is then unknown.
+        maxWordLength = dictionaryWords.maxOfOrNull { it.length } ?: 0
+    }
+
+    /**
+     * Splits [sentence] into words.
+     *
+     * The sentence is first cut at every run of characters other than ASCII letters, digits
+     * and apostrophes. Each remaining chunk is segmented on its own.
+     *
+     * @return one entry per chunk, each holding that chunk's words separated by single spaces
+     */
+    fun splitContiguousWords(sentence: String): List<String> {
+        val splitWords =
+            sentence.split(splitRegex).filter { it.isNotEmpty() }.map { segment(it) }
+        println("Split word for the sentence: $splitWords")
+        return splitWords
+    }
+
+    /** Segments one separator-free chunk and returns its words joined by single spaces. */
+    private fun segment(partSentence: String): String {
+        // cost[i] = (minimal cost of the first i characters, length of the last word used).
+        val cost = ArrayList<Pair<Double, Int>>(partSentence.length + 1)
+        cost.add(0.0 to 0)
+        for (index in 1..partSentence.length) {
+            cost.add(bestMatch(partSentence, cost, index))
+        }
+
+        // Backtrack to recover the minimal-cost string. Every stored length is >= 1, so idx
+        // strictly decreases and the loop always terminates.
+        val output = mutableListOf<String>()
+        var idx = partSentence.length
+        while (idx > 0) {
+            val length = cost[idx].second
+            val token = partSentence.substring(idx - length, idx)
+            val lastWord = output.lastOrNull()
+            // Apostrophe and digit handling: re-attach a split-off "'s" and keep adjacent
+            // digits together. A lone apostrophe is never merged (compared by value).
+            if (
+                lastWord != null &&
+                    token != "'" &&
+                    (lastWord.equals("'s", ignoreCase = true) ||
+                        (partSentence[idx - 1].isDigit() && lastWord[0].isDigit()))
+            ) {
+                output[output.lastIndex] = token + lastWord
+            } else {
+                output.add(token)
+            }
+            idx -= length
+        }
+
+        return output.asReversed().joinToString(" ")
+    }
+
+    /**
+     * Finds the best last word for the first [index] characters of [partSentence], given
+     * that [cost] already holds the results for all shorter prefixes.
+     *
+     * Dictionary lookups are done on the lower-cased text. When several dictionary words
+     * tie, the shortest wins. If no dictionary word ends at [index], a single character is
+     * consumed at [UNKNOWN_CHAR_COST].
+     *
+     * @return a pair (match_cost, match_length); match_length is always at least 1
+     */
+    private fun bestMatch(
+        partSentence: String,
+        cost: List<Pair<Double, Int>>,
+        index: Int
+    ): Pair<Double, Int> {
+        // Fallback: treat the last character as an unknown one-letter word.
+        var bestCost = cost[index - 1].first + UNKNOWN_CHAR_COST
+        var bestLength = 1
+        // No dictionary word is longer than maxWordLength, so earlier starts cannot match.
+        val windowStart = max(0, index - maxWordLength)
+        for (start in index - 1 downTo windowStart) {
+            val known = wordCost[partSentence.substring(start, index).lowercase()] ?: continue
+            val total = cost[start].first + known
+            if (total < bestCost) {
+                bestCost = total
+                bestLength = index - start
+            }
+        }
+        return bestCost to bestLength
+    }
+}
diff --git a/app/src/main/kotlin/me/leon/misc/VariableNaming.kt b/app/src/main/kotlin/me/leon/misc/VariableNaming.kt
@@ -34,6 +34,7 @@ fun String.normalSpaceFormat(): String =
 
 /** Lower camel case, e.g. hellLeon */
 fun String.camelNaming() = charTransform(Char::uppercase)
+
 // pascal case
 
 /** Pascal naming, i.e. upper camel case, e.g. HellLeon */
@@ -62,6 +63,7 @@ fun String.snakeCaseNaming() = normalSpaceFormat().replace(" ", "_")
 
 /** Upper-case naming, e.g. HELLO_LEON; used for constants. Upper-cases the snake-case form. */
 fun String.uppercaseNaming() = snakeCaseNaming().uppercase()
+
 //
 /** Kebab case, also called spinal case: spaces of the normalized form become `-`. */
 fun String.dashNaming() = normalSpaceFormat().replace(" ", "-")