Skip to content

Commit

Permalink
feat(app): Misc add wordninja
Browse files Browse the repository at this point in the history
  • Loading branch information
Leon406 committed May 30, 2024
1 parent 438cdc9 commit ed84b00
Show file tree
Hide file tree
Showing 9 changed files with 126,257 additions and 4 deletions.
1 change: 1 addition & 0 deletions app/src/main/kotlin/me/leon/ToolsApp.kt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ class ToolsApp : App(Home::class, Styles::class) {
}
}
println(dict)
copyResourceFileIfNotExist("/wordninja_words.txt", WORDNINJA_DICT_FILE)
// Load the class ahead of time so that its init runs
Translator.init()
}
Expand Down
1 change: 1 addition & 0 deletions app/src/main/kotlin/me/leon/config/Config.kt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ const val WIKI = "https://github.com/Leon406/ToolsFx/wiki/Home"
/** Absolute path that the empty relative path `File("")` resolves to; used as the application root. */
val APP_ROOT: String = File("").absolutePath
/** The `dict` directory directly under [APP_ROOT]. */
val DICT_DIR: String = "$APP_ROOT/dict"
/** The `vocabulary` directory directly under [APP_ROOT]. */
val VOCABULARY_DIR: String = "$APP_ROOT/vocabulary"
/** The `wordninja_words.txt` word list inside [VOCABULARY_DIR]; read by `me.leon.misc.Spliter` when it initializes. */
val WORDNINJA_DICT_FILE = File(VOCABULARY_DIR, "wordninja_words.txt")
1 change: 1 addition & 0 deletions app/src/main/kotlin/me/leon/misc/MiscConfig.kt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ val MISC_CONFIG =
),
MiscServiceType.VARIABLE_NAMING to
mapOf(HINT to "variable naming convert, separate by line"),
MiscServiceType.WORD_SPLITER to mapOf(HINT to "split no space letters, eg. whoseyourdaddy"),
)

val MISC_OPTIONS_CONFIG =
Expand Down
8 changes: 8 additions & 0 deletions app/src/main/kotlin/me/leon/misc/MiscServiceType.kt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,14 @@ enum class MiscServiceType(val type: String) : MiscService {
}
}
},
WORD_SPLITER("word ninja") {
    /**
     * Runs [Spliter.splitContiguousWords] on the input through `lineAction2String`
     * (presumably once per input line - confirm against that helper) and joins the
     * resulting words with single spaces.
     *
     * A failure while splitting does not propagate: the throwable is converted with
     * `stacktrace()` and that text is used as the result instead.
     */
    override fun process(raw: String, params: Map<String, String>): String =
        raw.lineAction2String { line ->
            val attempt = runCatching { Spliter.splitContiguousWords(line).joinToString(" ") }
            attempt.getOrElse { failure -> failure.stacktrace() }
        }
},
;

override fun hint(): String {
Expand Down
104 changes: 104 additions & 0 deletions app/src/main/kotlin/me/leon/misc/Spliter.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package me.leon.misc

import kotlin.math.ln
import kotlin.math.max
import me.leon.config.WORDNINJA_DICT_FILE

/**
 * Splits text written without spaces into its most likely sequence of dictionary words.
 *
 * Ported from
 * https://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words/11642687
 *
 * The dictionary is read from [WORDNINJA_DICT_FILE], one word per line. The word on (1-based)
 * line `rank` gets the cost `ln(rank * ln(dictionarySize))`, so words listed earlier are cheaper.
 * A segmentation's cost is the sum of its words' costs and the cheapest segmentation wins.
 *
 * The dictionary is loaded in this object's initializer, i.e. on first use of [Spliter];
 * reading the file may throw if it is missing or unreadable.
 */
object Spliter {
    /** Everything except ASCII letters, digits and the apostrophe separates independent parts. */
    private val splitRegex = "[^a-zA-Z0-9']+".toRegex()

    /** Dictionary word -> cost (lower means more likely). */
    private val wordCost: Map<String, Double>

    /** Length of the longest dictionary word; bounds how far back [bestMatch] has to look. */
    private val maxWordLength: Int

    init {
        val dictionaryWords: List<String> = WORDNINJA_DICT_FILE.readLines()
        // Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
        val lgDictSize = ln(dictionaryWords.size.toDouble())
        val costs = mutableMapOf<String, Double>()
        dictionaryWords.forEachIndexed { position, word ->
            // Ranks are 1-based; a word that appears twice keeps the cost of its last occurrence.
            costs[word] = ln((position + 1) * lgDictSize)
        }
        wordCost = costs
        // An empty dictionary must not fail here; 0 makes every character an unknown token.
        maxWordLength = dictionaryWords.maxOfOrNull { it.length } ?: 0
    }

    /**
     * Splits [sentence] on every run of characters other than ASCII letters, digits and `'`,
     * then segments each non-empty part into words.
     *
     * @return one entry per part, each entry being that part's words joined by single spaces
     */
    fun splitContiguousWords(sentence: String): List<String> {
        val splitWords = sentence.split(splitRegex).filter { it.isNotEmpty() }.map { split(it) }
        println("Split word for the sentence: $splitWords")
        return splitWords
    }

    /**
     * Segments one separator-free part into words.
     *
     * @return the words of [partSentence] in their original order, joined by single spaces
     * @throws IllegalStateException if backtracking disagrees with the cost table
     */
    private fun split(partSentence: String): String {
        // cost[i] = (cost of the best segmentation of the first i characters, length of its last word)
        val cost = ArrayList<Pair<Double, Int>>(partSentence.length + 1)
        cost.add(0.0 to 0)
        for (index in 1..partSentence.length) {
            cost.add(bestMatch(partSentence, cost, index))
        }

        // Backtrack to recover the minimal-cost string; tokens are collected last-to-first.
        var idx = partSentence.length
        val output = mutableListOf<String>()
        while (idx > 0) {
            val (candidateCost, matchLength) = bestMatch(partSentence, cost, idx)
            check(candidateCost == cost[idx].first) {
                "Candidate cost unmatched; This should not be the case!"
            }
            val token = partSentence.substring(idx - matchLength, idx)
            val lastWord = output.lastOrNull()
            // Apostrophe and digit handling: re-attach a split-off "'s" and merge adjacent digits.
            // A lone apostrophe is never merged. The comparison must be structural (`!=`):
            // substring() returns a new String, so a referential check would always succeed.
            val attachToPrevious =
                token != "'" &&
                    lastWord != null &&
                    (lastWord.equals("'s", ignoreCase = true) ||
                        (partSentence[idx - 1].isDigit() && lastWord[0].isDigit()))
            if (attachToPrevious) {
                output[output.lastIndex] = token + lastWord
            } else {
                output.add(token)
            }
            // bestMatch always reports a length of at least 1, so this loop terminates.
            idx -= matchLength
        }

        return output.asReversed().joinToString(" ")
    }

    /**
     * Find the best match for the [index] first characters, assuming [cost] has been built for
     * the `index - 1` first characters. Returns a pair (match_cost, match_length).
     *
     * Substrings that are not in the dictionary cost positive infinity. When no dictionary word
     * ends at [index], the result is an infinite cost with a match length of 1, so the unknown
     * character becomes a token of its own. Among equally cheap matches the shortest one wins.
     */
    private fun bestMatch(
        partSentence: String,
        cost: List<Pair<Double, Int>>,
        index: Int
    ): Pair<Double, Int> {
        val longestCandidate = index - max(0, index - maxWordLength)
        var bestCost = Double.POSITIVE_INFINITY
        var bestLength = 1
        for (length in 1..longestCandidate) {
            val candidate = partSentence.substring(index - length, index).lowercase()
            val candidateWordCost = wordCost[candidate] ?: continue
            val total = cost[index - length].first + candidateWordCost
            // Strict comparison keeps the first (shortest) match among equal costs.
            if (total < bestCost) {
                bestCost = total
                bestLength = length
            }
        }
        return bestCost to bestLength
    }
}
2 changes: 2 additions & 0 deletions app/src/main/kotlin/me/leon/misc/VariableNaming.kt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ fun String.normalSpaceFormat(): String =

/** (Lower) camel case naming, e.g. hellLeon; delegates to `charTransform` with [Char.uppercase]. */
fun String.camelNaming() = charTransform(Char::uppercase)

// pascal case

/** Pascal naming, i.e. (upper) camel case, e.g. HellLeon */
Expand Down Expand Up @@ -62,6 +63,7 @@ fun String.snakeCaseNaming() = normalSpaceFormat().replace(" ", "_")

/** Upper-case naming, e.g. HELLO_LEON (used for constants): the [snakeCaseNaming] result in upper case. */
fun String.uppercaseNaming() = snakeCaseNaming().uppercase()

//
/** Kebab case (also called spinal case): the [normalSpaceFormat] result with every space replaced by `-`. */
fun String.dashNaming() = normalSpaceFormat().replace(" ", "-")
Expand Down
Loading

0 comments on commit ed84b00

Please sign in to comment.