from pathlib import Path
from collections import defaultdict

import attr
import pylexibank
from clldutils.misc import slug
from openpyxl import load_workbook

DATAFILE = "journal.pone.0128448.s001.xlsx"

# Fix cognate assignments for cells where a slash-separated value carries the
# wrong number of cognate set labels, e.g.
#   na-mu/naŋ   1
# is split into the two forms "na-mu" and "naŋ", but we don't know which of
# them cognate set 1 refers to without checking the data. The dictionary below
# remaps these entries so that the cognate sets line up with the correct
# lexical forms; for example
#   "na-mu/naŋ": "1 & #",
# means that "na-mu" belongs to cognate set 1, while "naŋ" has no cognate.
FIXED_COGNATES = {
    # tree
    "na-mu/naŋ": "1 & #",
    "na-mu/naŋ-gu": "1 & #",
    "na-mu/naŋ-kʰi": "1 & #",
    "naŋ-kʰi/na-mu": "# & 1",
    "na-mu/naŋ-gi": "1 & #",
    "naŋ-gu/na-mu": "# & 1",
    # below
    "mi-te/a-re": "1 & 2",  # typo in original?
    # hair
    "thǝ-ri/thǝ-rǝk": "2 & 2",
    # yawn
    "ha-pɨi-om/ha-ø-jom": "1 & 1",
    # sea
    "pa-da/pa-ral": "1 & 1",
    # old
    "jet/nət": "1 & 1",
}
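
# Illustration (not part of the pipeline): the "&"-separated labels above are
# zipped against the forms produced by splitting the raw cell, mirroring the
# logic in cmd_makecldf below, so "na-mu" pairs with "1" and "naŋ" with "#"
# (i.e. no cognate set):
#
#   forms = "na-mu/naŋ".split("/")                     # ["na-mu", "naŋ"]
#   labels = [c.strip() for c in "1 & #".split("&")]   # ["1", "#"]
#   list(zip(forms, labels))                           # [("na-mu", "1"), ("naŋ", "#")]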


@attr.s
class CustomConcept(pylexibank.Concept):
    Korean_Gloss = attr.ib(default=None)
    Number = attr.ib(default=None)


def read_data(filename):
    excel = load_workbook(filename, read_only=True)
    sheet = excel["Raw data"]
    for row_id, row in enumerate(sheet.rows):
        if row_id in (0, 2):
            continue  # skip these rows
        elif row_id == 1:
            # construct a lookup table of <word> -> column indexes;
            # each word should have two indexes: the first is the
            # lexeme, the second the cognate.
            words = defaultdict(list)
            for i, h in enumerate(row):
                if i == 0:
                    continue  # skip the language column
                words[h.value.strip()].append(i)
        elif row_id > 17:
            break  # no more data
        else:
            for word in words:
                lang = row[0].value.strip()
                gloss = row[words[word][0]]
                cognate = row[words[word][1]]
                if gloss.value.strip() == "#":
                    continue  # skip empty records
                src = None
                # Two hundred and forty six (246) basic vocabulary items
                # [19,20] were extracted from each of 14 living and one (1)
                # ancient Koreanic variants using multiple sources: (i) a
                # large field collection made by Shimpei Ogura [21], (ii) a
                # modern dictionary of Koreanic variants that combines
                # lexicons from several different references [22], and (iii)
                # an etymological glossary of Middle Korean that contains
                # lexicons sampled from over 240 historical documents [23].
                #
                # -- Interpreting this as:
                # - all Middle Korean forms are from r23:
                #   Nam K. Kyohak koe sacen (A Middle Korean dictionary).
                #   Seoul: Kyohaksa; 2014.
                # - all underlined forms in the excel sheet are from r22:
                #   Nanmal ohwi chongbo chori yonguso. Urimal pangen sacen
                #   (A dictionary of Korean dialects). Seoul: Nanmal ohwi
                #   chongbo chori yonguso; 2010.
                # - everything else is from r21:
                #   Ogura S. Chosengo hogen no kenkyu (A study of Korean
                #   dialects). Tokyo: Iwanami Shoten; 1944.
                #
                # That is: underlined items were extracted from Nanmal Ohwi
                # Chongbo Chori Yonguso; Middle Korean was extracted from
                # GW Nam's collection.
                if lang == "Middle Korean":
                    src = "Nam2014"
                elif gloss.font.underline:
                    src = "Nanmal2010"
                else:
                    src = "Ogura1944"
                yield (lang, word, gloss.value.strip(), cognate.value, src)
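
# A minimal usage sketch for read_data (hypothetical output; the exact rows
# depend on the spreadsheet). Each yielded tuple has the shape
# (language, concept, form value, raw cognate cell, source key):
#
#   for lang, word, gloss, cognate, src in read_data(DATAFILE):
#       print(lang, word, gloss, cognate, src)
#       # -> e.g. "Middle Korean", "tree", "na-mu/naŋ", 1, "Nam2014"
#       break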


class Dataset(pylexibank.Dataset):
    dir = Path(__file__).parent
    id = "leekoreanic"
    concept_class = CustomConcept
    writer_options = dict(keep_languages=False, keep_parameters=False)

    # define the way in which forms should be handled
    form_spec = pylexibank.FormSpec(
        brackets={"(": ")"},  # characters that function as brackets
        separators=";/,",  # characters that split forms, e.g. "a, b"
        missing_data=("?", "-", "#"),  # markers that denote missing data
        strip_inside_brackets=True,  # remove material inside brackets
    )
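
    # Illustration (hypothetical raw values): with this FormSpec, a cell like
    # "na-mu/naŋ" is split on "/" into the two forms "na-mu" and "naŋ"; a
    # parenthesised note such as "pa-da (sea)" is stripped down to "pa-da";
    # and a bare "#" is treated as missing data and produces no form at all.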

    def cmd_download(self, args):
        self.raw_dir.download("https://doi.org/10.1371/journal.pone.0128448.s001", DATAFILE)

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        args.writer.add_sources()

        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.number + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Korean_Gloss=concept.attributes["korean"],
            )
            concepts[concept.english] = idx

        languages = args.writer.add_languages(lookup_factory=lambda l: l["Name"])

        for lang, word, gloss, cognate, src in read_data(self.raw_dir / DATAFILE):
            lex = args.writer.add_forms_from_value(
                Language_ID=languages[lang],
                Parameter_ID=concepts[word],
                Value=gloss,
                Source=[src],
            )
            # sort out cognates
            cognate = FIXED_COGNATES.get(gloss, cognate)
            try:
                cognate = [c.strip() for c in cognate.split("&")]
            except AttributeError:
                cognate = [cognate]
            # handle the cases where a lexeme is given multiple cognates
            if len(lex) < len(cognate):
                cognate = [";".join(cognate)]
            else:
                cognate = [str(c) for c in cognate]
            for l, cog in zip(lex, cognate):
                if cog == "#":
                    continue
                for c in cog.split(";"):  # expand multiple cognates
                    args.writer.add_cognate(
                        lexeme=l,
                        Cognateset_ID="%s-%d" % (concepts[word], int(c)),
                        Source=["Lee2015"],
                    )
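
# A sketch of how this dataset would typically be built (assuming pylexibank/
# cldfbench are installed and local Concepticon and Glottolog clones are
# available; the catalog paths below are placeholders):
#
#   cldfbench download lexibank_leekoreanic.py
#   cldfbench lexibank.makecldf lexibank_leekoreanic.py \
#       --concepticon /path/to/concepticon-data --glottolog /path/to/glottolog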