Skip to content

Commit

Permalink
profile and pythonCFG test passes
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed Oct 15, 2023
1 parent 0a923fc commit c517d0c
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ infix fun FSA.intersectLevFSA(cfg: CFG) = cfg.intersectLevFSA(this)
// https://browse.arxiv.org/pdf/2209.06809.pdf#page=5

infix fun CFG.intersectLevFSA(fsa: FSA): CFG {
val clock = TimeSource.Monotonic.markNow()
var clock = TimeSource.Monotonic.markNow()
val initFinal =
(fsa.init * fsa.final).map { (q, r) -> "START -> [$q,START,$r]" }

Expand All @@ -27,38 +27,75 @@ infix fun CFG.intersectLevFSA(fsa: FSA): CFG {
second.coords().dominates(third.coords())
}

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
// such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
val unitProds = unitProdRules(fsa)

// For each production A → BC in P, for every p, q, r ∈ Q,
// we have the production [p,A,r] → [p,B,q] [q,C,r] in P′.
val binaryProds =
nonterminalProductions.map {
val triples = fsa.states * fsa.states * fsa.states
val (A, B, C) = it.π1 to it.π2[0] to it.π2[1]
triples
// CFG ∩ FSA in general we are not allowed to do this, but it works
// because we assume a Levenshtein FSA which is monotone and acylic.
// CFG ∩ FSA - in general we are not allowed to do this, but it works
// because we assume a Levenshtein FSA, which is monotone and acyclic.
.filter { it.isValid() }
.map { (p, q, r) -> "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]" }
}.flatten()

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
// such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
val unitProds =
unitProductions.map { (A, rhs) ->
val relevantTransits = fsa.Q.filter { it.π2 == rhs[0] }
relevantTransits.map { (p, σ, q) -> "[$p,$A,$q] -> $σ" }
}.flatten()
println("Constructing ∩-grammar took: ${clock.elapsedNow().inWholeMilliseconds}ms")
clock = TimeSource.Monotonic.markNow()
return (initFinal + transits + binaryProds + unitProds).postProcess()
.also { println("Postprocessing took ${clock.elapsedNow().inWholeMilliseconds}ms") }
}

return (initFinal + transits + binaryProds + unitProds).joinToString("\n")
.parseCFG(normalize = false)
// For every unit production A → σ of this grammar and every triple (p, σ, q)
// in fsa.Q whose middle component equals that σ, emits the rule string
// "[p,A,q] -> σ". The result is one flat list covering all unit productions.
private fun CFG.unitProdRules(fsa: FSA) =
  unitProductions.flatMap { (A, terminals) ->
    fsa.Q
      .filter { arc -> arc.π2 == terminals[0] }
      .map { (p, σ, q) -> "[$p,$A,$q] -> $σ" }
  }

/**
 * Turns a list of textual production rules into a pruned CFG.
 *
 * The rules are joined one per line and parsed with `normalize = false`. The
 * total production count is printed, then [dropVestigialProductions],
 * `normalForm` and `noNonterminalStubs` are applied in that order, and the
 * number of remaining ("useful") productions is printed.
 */
fun List<Σᐩ>.postProcess() =
joinToString("\n").parseCFG(normalize = false)
.also { println("∩-grammar has ${it.size} total productions") }
.dropVestigialProductions().normalForm.noNonterminalStubs
.also { println("∩-grammar has ${it.size} useful productions") }
// NOTE(review): the next line reads `clock`, which is not declared in this
// function. It looks like the pre-refactor timing line carried over from the
// diff rendering - confirm it is absent from the committed file.
.also { println("∩-grammar construction took: ${clock.elapsedNow().inWholeMilliseconds}ms") }
// .also { println(it.pretty) }
// .also { println(it.size) }
// .also { println(it.pretty) }
// .also { println(it.size) }

// Recursively removes all productions from a synthetic CFG containing a
// dangling nonterminal, i.e., a nonterminal that does not produce any terminals
//
// This works but is the most inefficient part of the current implementation...
//
// TODO: Maybe instead of creating an enormous CFG and then removing productions
// we can just create a CFG that only contains the productions we need, by
// starting from the terminals and working our way up to START?
// Consider:
// ∩-grammar has 96634 total productions
// Removed 81177 vestigial productions.
// Removed 15035 vestigial productions.
// Removed 331 vestigial productions.
// Removed 57 vestigial productions.
// Removed 7 vestigial productions.
// Removed 0 vestigial productions.
// Disabling nonterminal stubs!
// ∩-grammar has 56 useful productions <- Why can't we just create this CFG?!
/**
 * Repeatedly prunes productions that mention an undefined synthetic nonterminal.
 *
 * One pass removes every production whose right-hand side contains a symbol
 * that satisfies [criteria] but never occurs as the left-hand side of any
 * production, then runs `removeUselessSymbols()` on what is left and prints
 * how many productions disappeared. Passes repeat until the production count
 * stops changing, at which point the receiver of that final pass is returned.
 *
 * @param criteria decides whether a symbol is a synthetic nonterminal. The
 *   default accepts symbols shaped like `[p,A,q]`: a leading `[`, a trailing
 *   `]` and exactly two commas.
 */
fun CFG.dropVestigialProductions(
  criteria: (Σᐩ) -> Boolean = { it.first() == '[' && it.last() == ']' && it.count { it == ',' } == 2 }
): CFG {
  // Every symbol that is defined, i.e. appears on some left-hand side.
  val defined: Set<Σᐩ> = mapTo(mutableSetOf()) { it.LHS }

  // A production is vestigial if any RHS symbol is synthetic yet undefined.
  val kept = toMutableSet()
  kept.removeAll { prod -> prod.RHS.any { sym -> criteria(sym) && sym !in defined } }
  val rw = kept.removeUselessSymbols()

  println("Removed ${size - rw.size} vestigial productions.")

  // Fixpoint reached: this pass changed nothing, so hand back the receiver.
  if (rw.size == size) return this
  return rw.dropVestigialProductions(criteria)
}


infix fun FSA.intersect(cfg: CFG) = cfg.intersect(this)

infix fun CFG.intersect(fsa: FSA): CFG {
Expand All @@ -69,6 +106,10 @@ infix fun CFG.intersect(fsa: FSA): CFG {
val transits =
fsa.Q.map { (q, a, r) -> "[$q,$a,$r] -> $a" }

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
// such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
val unitProds = unitProdRules(fsa)

// For each production A → BC in P, for every p, q, r ∈ Q,
// we have the production [p,A,r] → [p,B,q] [q,C,r] in P′.
val binaryProds =
Expand All @@ -78,20 +119,6 @@ infix fun CFG.intersect(fsa: FSA): CFG {
triples.map { (p, q, r) -> "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]" }
}.flatten()

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
// such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
val unitProds =
unitProductions.map { (A, rhs) ->
val relevantTransits = fsa.Q.filter { it.π2 == rhs[0] }
relevantTransits.map { (p, σ, q) -> "[$p,$A,$q] -> $σ" }
}.flatten()

return (initFinal + transits + binaryProds + unitProds).joinToString("\n")
.parseCFG(normalize = false)
.also { println("∩-grammar has ${it.size} total productions") }
.dropVestigialProductions().normalForm.noNonterminalStubs
.also { println("∩-grammar has ${it.size} useful productions") }
.also { println("∩-grammar construction took: ${clock.elapsedNow().inWholeMilliseconds}ms") }
// .also { println(it.pretty) }
// .also { println(it.size) }
return (initFinal + transits + binaryProds + unitProds).postProcess()
.also { println("Postprocessing took ${clock.elapsedNow().inWholeMilliseconds}ms") }
}
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ val CFG.noNonterminalStubs: CFG by cache {
/**
 * A copy of this grammar without nonterminal-stub or ε productions, obtained
 * through the `cache` delegate.
 *
 * Evaluating it prints "Disabling nonterminal stubs!", keeps only productions
 * whose right-hand side has no symbol for which `isNonterminalStubIn(this)`
 * holds, and then drops every production whose string form contains "ε".
 * The result is registered in [rewriteHistory] and receives this grammar's
 * `blocked` entries.
 */
val CFG.noEpsilonOrNonterminalStubs: CFG by cache {
println("Disabling nonterminal stubs!")
filter { it.RHS.none { it.isNonterminalStubIn(this) } }
// NOTE(review): the next two lines look like the before/after versions of the
// same step as rendered by the diff (with and without
// dropVestigialProductions) - confirm only one exists in the real file.
.filter { "ε" !in it.toString() }.toSet().dropVestigialProductions()
.filter { "ε" !in it.toString() }.toSet()
// The stored history is the frozen receiver's existing history plus the frozen
// receiver itself (the inner `it` is the frozen grammar). `!!` throws if the
// receiver has no history entry yet.
.also { rewriteHistory.put(it, freeze().let { rewriteHistory[it]!! + listOf(it)}) }
// Carry the receiver's blocked entries over to the filtered grammar.
.also { it.blocked.addAll(blocked) }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,6 @@ import ai.hypergraph.kaliningraph.types.*

val rewriteHistory = LRUCache<CFG, List<CFG>>()

// Recursively removes all productions from a synthetic CFG containing a
// dangling nonterminal, i.e., a nonterminal that does not produce any terminals.
fun CFG.dropVestigialProductions(
// By default a symbol counts as synthetic when it has the shape "[x,y,z]":
// a leading '[', a trailing ']' and exactly two commas.
criteria: (Σᐩ) -> Boolean = { it.first() == '[' && it.last() == ']' && it.count { it == ',' } == 2 }
): CFG {
// Every symbol that appears on the left-hand side of some production.
val nts: Set<Σᐩ> = map { it.LHS }.toSet()
// Remove each production whose RHS mentions a symbol that matches `criteria`
// but is not defined by any production, then run removeUselessSymbols().
val rw = toMutableSet()
.apply { removeAll { !it.RHS.all { !criteria(it) || it in nts } } }
.removeUselessSymbols()

// println("Removed ${size - rw.size} vestigial productions.")

// Stop once a pass leaves the production count unchanged; otherwise prune again.
return if (rw.size == size) this else rw.dropVestigialProductions(criteria)
}

/**
* n.b. Normalization may destroy organic nonterminals!
* In order to preserve every organic nonterminal, you
Expand All @@ -47,8 +32,6 @@ fun CFG.normalize(): CFG =
// Must remember to run the unit test if order changes in the future
// ./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.sat.SATValiantTest.testTLArithmetic"
.generateNonterminalStubs()
// Should only need to run this on synthetic CFGs
.dropVestigialProductions()
.also { cnf -> rewriteHistory.put(cnf.freeze(), rewrites) }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ interface IGraph<G, E, V>: IGF<G, E, V>, Set<V>, Encodable
// Combines two graphs by passing three pieces to G(...): `this - that`,
// `this join that`, and `that - this`.
operator fun plus(that: G): G =
G((this - that) + (this join that) + (that - this))


// Builds a graph via G(...) from this graph's vertices with `graph`'s vertices removed.
operator fun minus(graph: G): G = G(vertices - graph.vertices)

infix fun join(that: G): Set<V> =
Expand Down Expand Up @@ -299,9 +300,10 @@ abstract class Edge<G, E, V>(override val source: V, override val target: V) :
/**
 * Base class for vertices whose identity is their string [id]: equality,
 * hashing, encoding and the string form are all derived from [id].
 */
abstract class Vertex<G, E, V>(override val id: String) :
AGF<G, E, V>(), IVertex<G, E, V>
where G : Graph<G, E, V>, E : Edge<G, E, V>, V : Vertex<G, E, V> {
// Hash of `id`, computed on first access and then reused.
val hash by lazy { id.hashCode() }
// Two vertices are equal when their ids match; any non-Vertex is unequal.
override fun equals(other: Any?) = (other as? Vertex<*, *, *>)?.let { id == it.id } ?: false
override fun encode() = id.vectorize()
// NOTE(review): the two hashCode overrides below look like the before/after
// versions of one line as rendered by the diff - confirm only the cached
// variant (`hash`) exists in the real file.
override fun hashCode() = id.hashCode()
override fun hashCode() = hash
override fun toString() = id
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,9 @@ class BarHillelTest {
// Smoke test: intersects the Python grammar (with ε and nonterminal stubs
// removed) with a Levenshtein FSA built by makeLevFSA, enumerates sequences
// for a template of "_" holes and prints each one. It makes no assertions.
@Test
fun testPythonBarHillel() {
val gram = SetValiantTest.seq2parsePythonCFG.noEpsilonOrNonterminalStubs
// NOTE(review): the next two lines look like the pre-commit call chain
// ("1 + 2", 5 holes) and the three after them its replacement
// ("NUMBER + NEWLINE", 4 holes) - confirm against the committed file.
gram.intersectLevFSA(makeLevFSA("1 + 2", 1, gram.terminals))
.enumSeq(List(5) { "_" }.joinToString(" "))
// The second makeLevFSA argument (1) is presumably the edit-distance bound - TODO confirm.
gram.intersectLevFSA(makeLevFSA("NUMBER + NEWLINE", 1, gram.terminals))
.also { println("LEV ∩ CFG grammar:\n${it.pretty}") }
.enumSeq(List(4) { "_" }.joinToString(" "))
.onEach { println(it) }.toList()
}
}

0 comments on commit c517d0c

Please sign in to comment.