profile and pythonCFG test passes

breandan · Oct 15, 2023 · c517d0c · c517d0c
1 parent 0a923fc
commit c517d0c
Show file tree

Hide file tree

Showing 5 changed files with 66 additions and 53 deletions.
diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/BarHillel.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/BarHillel.kt
@@ -8,7 +8,7 @@ infix fun FSA.intersectLevFSA(cfg: CFG) = cfg.intersectLevFSA(this)
 // https://browse.arxiv.org/pdf/2209.06809.pdf#page=5
 
 infix fun CFG.intersectLevFSA(fsa: FSA): CFG {
-  val clock = TimeSource.Monotonic.markNow()
+  var clock = TimeSource.Monotonic.markNow()
   val initFinal =
     (fsa.init * fsa.final).map { (q, r) -> "START -> [$q,START,$r]" }
 
@@ -27,38 +27,75 @@ infix fun CFG.intersectLevFSA(fsa: FSA): CFG {
       second.coords().dominates(third.coords())
   }
 
+  // For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
+  // such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
+  val unitProds = unitProdRules(fsa)
+
   // For each production A → BC in P, for every p, q, r ∈ Q,
   // we have the production [p,A,r] → [p,B,q] [q,C,r] in P′.
   val binaryProds =
     nonterminalProductions.map {
       val triples = fsa.states * fsa.states * fsa.states
       val (A, B, C) = it.π1 to it.π2[0] to it.π2[1]
       triples
-        // CFG ∩ FSA in general we are not allowed to do this, but it works
-        // because we assume a Levenshtein FSA which is monotone and acylic.
+        // CFG ∩ FSA - in general we are not allowed to do this, but it works
+        // because we assume a Levenshtein FSA, which is monotone and acyclic.
         .filter { it.isValid() }
         .map { (p, q, r) -> "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]" }
     }.flatten()
 
-  // For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
-  // such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
-  val unitProds =
-    unitProductions.map { (A, rhs) ->
-      val relevantTransits = fsa.Q.filter { it.π2 == rhs[0] }
-      relevantTransits.map { (p, σ, q) -> "[$p,$A,$q] -> $σ" }
-    }.flatten()
+  println("Constructing ∩-grammar took: ${clock.elapsedNow().inWholeMilliseconds}ms")
+  clock = TimeSource.Monotonic.markNow()
+  return (initFinal + transits + binaryProds + unitProds).postProcess()
+    .also { println("Postprocessing took ${clock.elapsedNow().inWholeMilliseconds}ms") }
+}
 
-  return (initFinal + transits + binaryProds + unitProds).joinToString("\n")
-    .parseCFG(normalize = false)
+private fun CFG.unitProdRules(fsa: FSA) =
+  unitProductions.map { (A, rhs) ->
+    val relevantTransits = fsa.Q.filter { it.π2 == rhs[0] }
+    relevantTransits.map { (p, σ, q) -> "[$p,$A,$q] -> $σ" }
+  }.flatten()
+
+fun List<Σᐩ>.postProcess() =
+  joinToString("\n").parseCFG(normalize = false)
     .also { println("∩-grammar has ${it.size} total productions") }
     .dropVestigialProductions().normalForm.noNonterminalStubs
     .also { println("∩-grammar has ${it.size} useful productions") }
-    .also { println("∩-grammar construction took: ${clock.elapsedNow().inWholeMilliseconds}ms") }
-//    .also { println(it.pretty) }
-//    .also { println(it.size) }
+    //    .also { println(it.pretty) }
+    //    .also { println(it.size) }
+
+// Recursively removes all productions from a synthetic CFG containing a
+// dangling nonterminal, i.e., a nonterminal that does not produce any terminals.
+//
+// This works but is the most inefficient part of the current implementation...
+//
+// TODO: Maybe instead of creating an enormous CFG and then removing productions
+//       we can just create a CFG that only contains the productions we need, by
+//       starting from the terminals and working our way up to START?
+//  Consider:
+//    ∩-grammar has 96634 total productions
+//    Removed 81177 vestigial productions.
+//    Removed 15035 vestigial productions.
+//    Removed 331 vestigial productions.
+//    Removed 57 vestigial productions.
+//    Removed 7 vestigial productions.
+//    Removed 0 vestigial productions.
+//    Disabling nonterminal stubs!
+//    ∩-grammar has 56 useful productions <- Why can't we just create this CFG?!
+fun CFG.dropVestigialProductions(
+  criteria: (Σᐩ) -> Boolean = { it.first() == '[' && it.last() == ']' && it.count { it == ',' } == 2 }
+): CFG {
+  val nts: Set<Σᐩ> = map { it.LHS }.toSet()
+//  val reachable = reachableSymbols()
+  val rw = toMutableSet()
+    .apply { removeAll { prod -> !prod.RHS.all { !criteria(it) || (it in nts) } } }
+    .removeUselessSymbols()
+
+  println("Removed ${size - rw.size} vestigial productions.")
+
+  return if (rw.size == size) this else rw.dropVestigialProductions(criteria)
 }
 
-
 infix fun FSA.intersect(cfg: CFG) = cfg.intersect(this)
 
 infix fun CFG.intersect(fsa: FSA): CFG {
@@ -69,6 +106,10 @@ infix fun CFG.intersect(fsa: FSA): CFG {
   val transits =
     fsa.Q.map { (q, a, r) -> "[$q,$a,$r] -> $a" }
 
+  // For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
+  // such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
+  val unitProds = unitProdRules(fsa)
+
   // For each production A → BC in P, for every p, q, r ∈ Q,
   // we have the production [p,A,r] → [p,B,q] [q,C,r] in P′.
   val binaryProds =
@@ -78,20 +119,6 @@ infix fun CFG.intersect(fsa: FSA): CFG {
       triples.map { (p, q, r) -> "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]" }
     }.flatten()
 
-  // For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
-  // such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
-  val unitProds =
-    unitProductions.map { (A, rhs) ->
-      val relevantTransits = fsa.Q.filter { it.π2 == rhs[0] }
-      relevantTransits.map { (p, σ, q) -> "[$p,$A,$q] -> $σ" }
-    }.flatten()
-
-  return (initFinal + transits + binaryProds + unitProds).joinToString("\n")
-    .parseCFG(normalize = false)
-    .also { println("∩-grammar has ${it.size} total productions") }
-    .dropVestigialProductions().normalForm.noNonterminalStubs
-    .also { println("∩-grammar has ${it.size} useful productions") }
-    .also { println("∩-grammar construction took: ${clock.elapsedNow().inWholeMilliseconds}ms") }
-//    .also { println(it.pretty) }
-//    .also { println(it.size) }
+  return (initFinal + transits + binaryProds + unitProds).postProcess()
+    .also { println("Postprocessing took ${clock.elapsedNow().inWholeMilliseconds}ms") }
 }
diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/CFG.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/CFG.kt
@@ -100,7 +100,7 @@ val CFG.noNonterminalStubs: CFG by cache {
 val CFG.noEpsilonOrNonterminalStubs: CFG by cache {
   println("Disabling nonterminal stubs!")
   filter { it.RHS.none { it.isNonterminalStubIn(this) } }
-    .filter { "ε" !in it.toString() }.toSet().dropVestigialProductions()
+    .filter { "ε" !in it.toString() }.toSet()
     .also { rewriteHistory.put(it, freeze().let { rewriteHistory[it]!! + listOf(it)}) }
     .also { it.blocked.addAll(blocked) }
 }

diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/Normalization.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/Normalization.kt
@@ -12,21 +12,6 @@ import ai.hypergraph.kaliningraph.types.*
 
 val rewriteHistory = LRUCache<CFG, List<CFG>>()
 
-// Recursively removes all productions from a synthetic CFG containing a
-// dangling nonterminal, i.e., a nonterminal that does not produce any terminals.
-fun CFG.dropVestigialProductions(
-  criteria: (Σᐩ) -> Boolean = { it.first() == '[' && it.last() == ']' && it.count { it == ',' } == 2 }
-): CFG {
-  val nts: Set<Σᐩ> = map { it.LHS }.toSet()
-  val rw = toMutableSet()
-    .apply { removeAll { !it.RHS.all { !criteria(it) || it in nts } } }
-    .removeUselessSymbols()
-
-//  println("Removed ${size - rw.size} vestigal productions.")
-
-  return if (rw.size == size) this else rw.dropVestigialProductions(criteria)
-}
-
 /**
  * n.b. Normalization may destroy organic nonterminals!
  * In order to preserve every organic nonterminal, you
@@ -47,8 +32,6 @@ fun CFG.normalize(): CFG =
       // Must remember to run the unit test if order changes in the future
       // ./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.sat.SATValiantTest.testTLArithmetic"
       .generateNonterminalStubs()
-      // Should only need to run this on synthetic CFGs
-      .dropVestigialProductions()
       .also { cnf -> rewriteHistory.put(cnf.freeze(), rewrites) }
   }
 

diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/types/Graph.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/types/Graph.kt
@@ -88,6 +88,7 @@ interface IGraph<G, E, V>: IGF<G, E, V>, Set<V>, Encodable
   operator fun plus(that: G): G =
     G((this - that) + (this join that) + (that - this))
 
+
   operator fun minus(graph: G): G = G(vertices - graph.vertices)
 
   infix fun join(that: G): Set<V> =
@@ -299,9 +300,10 @@ abstract class Edge<G, E, V>(override val source: V, override val target: V) :
 abstract class Vertex<G, E, V>(override val id: String) :
   AGF<G, E, V>(), IVertex<G, E, V>
   where G : Graph<G, E, V>, E : Edge<G, E, V>, V : Vertex<G, E, V> {
+    val hash by lazy { id.hashCode() }
   override fun equals(other: Any?) = (other as? Vertex<*, *, *>)?.let { id == it.id } ?: false
   override fun encode() = id.vectorize()
-  override fun hashCode() = id.hashCode()
+  override fun hashCode() = hash
   override fun toString() = id
 }
 

diff --git a/src/commonTest/kotlin/ai/hypergraph/kaliningraph/parsing/BarHillelTest.kt b/src/commonTest/kotlin/ai/hypergraph/kaliningraph/parsing/BarHillelTest.kt
@@ -192,8 +192,9 @@ class BarHillelTest {
   @Test
   fun testPythonBarHillel() {
     val gram = SetValiantTest.seq2parsePythonCFG.noEpsilonOrNonterminalStubs
-    gram.intersectLevFSA(makeLevFSA("1 + 2", 1, gram.terminals))
-      .enumSeq(List(5) { "_" }.joinToString(" "))
+    gram.intersectLevFSA(makeLevFSA("NUMBER + NEWLINE", 1, gram.terminals))
+      .also { println("LEV ∩ CFG grammar:\n${it.pretty}") }
+      .enumSeq(List(4) { "_" }.joinToString(" "))
       .onEach { println(it) }.toList()
   }
 }