DATA: Improve processing of tokens

dwhieb · Jul 21, 2024 · 96629ed
1 parent b10ddef
commit 96629ed
Show file tree

Hide file tree

Showing 43 changed files with 11,938 additions and 11,958 deletions.
diff --git a/data/Components.js b/data/Components.js
@@ -114,24 +114,31 @@ class Token {
     notes,
     orthography,
     PA,
+    pages,
+    source,
     speaker,
     UR,
   }) {
 
     const isProto = language.includes(`Proto`)
 
+    this.bibliography = source
+
+    if (pages) {
+      this.bibliography += `: `
+      this.bibliography += parsePages(pages)
+    }
+
     if (form) {
       this.form = form.normalize()
       if (isProto) this.form = `*${ cleanProto(form) }`
     }
 
-    this.bibliography = bibliography
-    if (gloss) this.gloss = gloss
-    if (notes) this.notes = notes
-    this.orthography = orthography
-    if (PA) this.PA = cleanProto(PA)
+    if (gloss) this.gloss     = cleanGloss(gloss)
+    if (notes) this.notes     = notes.normalize()
+    if (PA) this.PA           = cleanProto(PA)
     if (speaker) this.speaker = speaker.normalize()
-    if (UR) this.UR = UR.normalize()
+    if (UR) this.UR           = cleanUR(UR)
 
   }
 }
@@ -446,45 +453,18 @@ export default class Components extends Map {
 
     const cols = Components.columns
 
-    // UR
-    const UR = cleanUR(record[cols.UR])
-
-    // Proto-Algonquian
-    const PA = record[cols.proto]?.normalize()
-
-    // Gloss
-    const gloss = cleanGloss(record[cols.gloss])
-
-    // Bibliography
-    const source = record[cols.sourceCode]
-
-    let   bibliography = source
-    const pages        = record[cols.pages]
-
-    if (pages) {
-      bibliography += `: `
-      bibliography += parsePages(record[cols.pages])
-    }
-
-    // Speaker
-    const speaker = record[cols.speaker]?.normalize()
-
-    // Notes
-    const notes = record[cols.notes]?.normalize()
-
-    // Orthography Key
-    const orthography = record[cols.orthography]
-
     return new Token({
-      bibliography,
-      form: record[cols.originalOrthography],
-      gloss,
+      bibliography: record[cols.bibliography],
+      form:         record[cols.originalOrthography],
+      gloss:        record[cols.gloss],
       language,
-      notes,
-      orthography,
-      PA,
-      speaker,
-      UR,
+      notes:        record[cols.notes],
+      orthography:  record[cols.orthography],
+      PA:           record[cols.proto],
+      pages:        record[cols.pages],
+      source:       record[cols.sourceCode],
+      speaker:      record[cols.speaker],
+      UR:           record[cols.UR],
     })
 
   }

diff --git a/data/json/components/Abenaki.ndjson b/data/json/components/Abenaki.ndjson
@@ -1,5 +1,5 @@
-{"componentID":"1","ID":"Abenaki-1","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"ɔ̃ben-","PA":"*a·p-","displayForm":"ɔ̃ben-","definition":"","type":"initial","subcategory":"","reduplicated":false,"deverbal":false,"matches":{},"notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","tokens":[{"form":"ôben-","bibliography":"IG1965: 213","gloss":"untie","notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","orthography":"AB:2","PA":"a·p-"}]}
-{"componentID":"2","ID":"Abenaki-2","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"ɔ̃jemi-","PA":"*a·t-","displayForm":"ɔ̃jemi-","definition":"","type":"initial","subcategory":"","reduplicated":false,"deverbal":false,"matches":{},"tokens":[{"form":"ôjemi-","bibliography":"IG1965: 214","gloss":"relate, declare","orthography":"AB:2","PA":"a·t-"}]}
-{"componentID":"3","ID":"Abenaki-3","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"ɔ̃tl-","PA":"*a·nt-","displayForm":"ɔ̃tl-","definition":"","type":"initial","subcategory":"","reduplicated":false,"deverbal":false,"matches":{},"notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\". MAM: this is a PA initial, so I don't know why it would say that.","tokens":[{"form":"ôtl-","bibliography":"IG1965: 214","gloss":"move, change, afresh, anew","notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","orthography":"AB:2","PA":"a·nt-"},{"form":"ôtto-","bibliography":"IG1965: 214","gloss":"move, change, afresh, anew","notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","orthography":"AB:2","PA":"a·nt-"}]}
-{"componentID":"5","ID":"Abenaki-5","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"-gɔ̃bawi","PA":"*-ka·pawi-","displayForm":"-gɔ̃bawi","definition":"","type":"final","subcategory":"AI","specificity":"","primary":true,"secondary":false,"deverbal":false,"matches":{},"tokens":[{"form":"-gan̈ba8i","bibliography":"IG1965: 215","gloss":"stand, be standing","orthography":"AB:2","PA":"-ka·pawi-"}]}
-{"componentID":"6","ID":"Abenaki-6","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"-igɔ̃n","displayForm":"-igɔ̃n","definition":"","type":"final","subcategory":"N","specificity":"","primary":true,"secondary":false,"deverbal":false,"matches":{},"notes":"GVM: From context, probably N final? Says \"for -igan read -igan̈n\".","tokens":[{"form":"-igan̈n","bibliography":"IG1965: 219","notes":"GVM: From context, probably N final? Says \"for -igan read -igan̈n\".","orthography":"AB:2"}]}
+{"componentID":"1","ID":"Abenaki-1","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"ɔ̃ben-","PA":"*a·p-","displayForm":"ɔ̃ben-","definition":"","type":"initial","subcategory":"","reduplicated":false,"deverbal":false,"matches":{},"notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","tokens":[{"bibliography":"IG1965: 213","form":"ôben-","gloss":"untie","notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","PA":"a·p-"}]}
+{"componentID":"2","ID":"Abenaki-2","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"ɔ̃jemi-","PA":"*a·t-","displayForm":"ɔ̃jemi-","definition":"","type":"initial","subcategory":"","reduplicated":false,"deverbal":false,"matches":{},"tokens":[{"bibliography":"IG1965: 214","form":"ôjemi-","gloss":"relate, declare","PA":"a·t-"}]}
+{"componentID":"3","ID":"Abenaki-3","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"ɔ̃tl-","PA":"*a·nt-","displayForm":"ɔ̃tl-","definition":"","type":"initial","subcategory":"","reduplicated":false,"deverbal":false,"matches":{},"notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\". MAM: this is a PA initial, so I don't know why it would say that.","tokens":[{"bibliography":"IG1965: 214","form":"ôtl-","gloss":"move, change, afresh, anew","notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","PA":"a·nt-"},{"bibliography":"IG1965: 214","form":"ôtto-","gloss":"move, change, afresh, anew","notes":"GVM: Looks like an initial, but is listed as \"(TA, TI)\"","PA":"a·nt-"}]}
+{"componentID":"5","ID":"Abenaki-5","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"-gɔ̃bawi","PA":"*-ka·pawi-","displayForm":"-gɔ̃bawi","definition":"","type":"final","subcategory":"AI","specificity":"","primary":true,"secondary":false,"deverbal":false,"matches":{},"tokens":[{"bibliography":"IG1965: 215","form":"-gan̈ba8i","gloss":"stand, be standing","PA":"-ka·pawi-"}]}
+{"componentID":"6","ID":"Abenaki-6","language":"Abenaki","displayLanguage":"Abenaki","dialect":"","Glottocode":"aben1250","ISO":"","form":"-igɔ̃n","displayForm":"-igɔ̃n","definition":"","type":"final","subcategory":"N","specificity":"","primary":true,"secondary":false,"deverbal":false,"matches":{},"notes":"GVM: From context, probably N final? Says \"for -igan read -igan̈n\".","tokens":[{"bibliography":"IG1965: 219","form":"-igan̈n","gloss":"","notes":"GVM: From context, probably N final? Says \"for -igan read -igan̈n\"."}]}