-
Notifications
You must be signed in to change notification settings - Fork 18
/
qald_parser.py
481 lines (374 loc) · 18.7 KB
/
qald_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""
This script intends to make a BIG_DATA counterpart for QALD questions.
It expects the SPARQLs to be parsed and stored in JSON format
via: https://github.com/RubenVerborgh/SPARQL.js
With the parsed SPARQLs, this will find the true path, and generate their false counterparts as well.
EXAMPLE of input:
{
u'distinct': True,
u'prefixes': {u'dbo': u'http://dbpedia.org/ontology/',
u'res': u'http://dbpedia.org/resource/'},
u'queryType': u'SELECT',
u'type': u'query',
u'variables': [u'?date'],
u'where': [{u'triples': [{u'object': u'?date',
u'predicate': u'http://dbpedia.org/ontology/date',
u'subject': u'http://dbpedia.org/resource/Battle_of_Gettysburg'}],
u'type': u'bgp'}]}
EXAMPLE of output:
{
u'_id': u'7c654505500d49bd835cc07799940fb1',
u'constraints': {
u'?uri': u'http://dbpedia.org/ontology/Politician',
u'count': True},
u'corrected_question': u'How many party leaders are there whose parties are headquartered in Berlin?',
u'entity': [u'http://dbpedia.org/resource/Berlin'],
u'path': [u'-http://dbpedia.org/ontology/headquarter', u'+http://dbpedia.org/ontology/leader'],
u'sparql_query': u'SELECT DISTINCT COUNT(?uri) WHERE {
?x <http://dbpedia.org/ontology/headquarter> <http://dbpedia.org/resource/Berlin> .
?x <http://dbpedia.org/ontology/leader> ?uri .
?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Politician> }',
u'sparql_template_id': 405,
u'verbalized_question': u'Count the number of <leader> of the <political parties> whose <admin HQ> is <Berlin>.'
}
"""
# @TODO: - handle count
import json
import pickle
import warnings
from pprint import pprint
# from utils.dbpedia_interface import DBPedia
from utils import natural_language_utilities as nlutils
# Some macros
# DEBUG: when truthy, run() prints a separator line and the raw SPARQL of every question it processes.
DEBUG = True
# Locations of the raw QALD-7 JSON dumps and of their parsed (pickled) counterparts.
# NOTE: despite the *_DIR_* names, these are file paths, not directories.
# In the code visible here only the *_TEST pair is read (by run()); the *_TRAIN pair is not referenced.
RAW_QALD_DIR_TRAIN = './resources/qald-7-train-multilingual.json'
RAW_QALD_DIR_TEST = './resources/qald-7-test-multilingual.json'
PARSED_QALD_DIR_TRAIN = './resources/qald-7-train-parsed.pickle'
PARSED_QALD_DIR_TEST = './resources/qald-7-test-parsed.pickle'
# Global variables
# dbp = DBPedia(_verbose=True, caching=True) # Summon a DBpedia interface
# Better warning formatting. Ignore.
def better_warning(message, category, filename, lineno, file=None, line=None):
    """
    Render a warning as one compact line: ' <filename>:<lineno>: <CategoryName>:<message>' + newline.

    :param message: the warning message (anything that can be turned into a string)
    :param category: the warning class; only its __name__ is used
    :param filename: name of the file the warning originated from
    :param lineno: line number the warning originated from
    :param file: accepted but unused
    :param line: accepted but unused
    :return: the formatted string

    NOTE(review): presumably meant to be installed as a custom formatter for the ``warnings`` module;
    no such assignment is visible in this file - confirm.
    """
    layout = ' {}:{}: {}:{}\n'
    category_name = category.__name__
    return layout.format(filename, lineno, category_name, message)
def __fill_single_triple_data__(_triple, _path, _ask = False):
"""
:param _triple: the triple from whence we scrape the content
:param _path: the path variable which we fill
:param _ask: boolean (if we want to handle ask questions)
:return:
None : something went wrong
-1 : out of scope
[ * ] : regular stuff
"""
if _ask:
# Check if we have any literal
if not (nlutils.is_dbpedia_uri(_triple['subject']) and '?' != nlutils.is_dbpedia_uri(_triple['subject'])) \
and (nlutils.is_dbpedia_uri(_triple['object']) and '?' != nlutils.is_dbpedia_uri(_triple['object'])):
return -1, -1
# # We don't have any literal
# _entity = [nlutils.is_dbpedia_shorthand(_triple['subject'], _convert=True),
# nlutils.is_dbpedia_shorthand(_triple['object'], _convert=True)]
_entity = []
if nlutils.is_dbpedia_uri(_triple['subject']) and '?' != nlutils.is_dbpedia_uri(_triple['subject']):
_entity.append(nlutils.is_dbpedia_shorthand(_triple['subject'], _convert=True))
if nlutils.is_dbpedia_uri(_triple['object']) and '?' != nlutils.is_dbpedia_uri(_triple['object']):
_entity.append(nlutils.is_dbpedia_shorthand(_triple['object'], _convert=True))
if not _entity:
_entity = -1
_path.append('+' + nlutils.is_dbpedia_shorthand(_triple['predicate'], _convert=True))
return _path, _entity
# Check whether the s or r is the variable
if str(_triple['subject'][0]) == '?':
# Template gon' be: e - r
_entity = [nlutils.is_dbpedia_shorthand(_triple['object'], _convert=True)]
_path.append('-' + nlutils.is_dbpedia_shorthand(_triple['predicate'], _convert=True))
elif str(_triple['object'][0]) == '?':
# Template gon' be: e + r
_entity = [nlutils.is_dbpedia_shorthand(_triple['subject'], _convert=True)]
_path.append('+' + nlutils.is_dbpedia_shorthand(_triple['predicate'], _convert=True))
else:
warnings.warn("qald_parser:__fill_single_triple_data: Cannot find a variable anywhere. Something forked up")
return None, None
return _path, _entity
def __fill_double_triple_data__(_triples, _path, _ask=False):
    """
    Build a two-hop path (and its topic entities) out of two triples.

    Procedure:
        1. Choose the "anchor" triple: the first triple if its subject or object is a DBpedia URI,
           otherwise the second one.
        2. The anchor contributes the first hop ('+pred' when the entity is its subject, '-pred' when
           it is its object); the term on the other side is remembered as the connecting variable.
        3. The remaining triple must mention that connecting variable:
               as subject -> hop '+pred'
               as object  -> hop '-pred'
           If the remaining triple also carries an entity, the term opposite to the connecting
           variable is added to the topic entities.

    :param _triples: list with (at least) two triple dicts ('subject', 'predicate', 'object')
    :param _path: list of signed predicates; appended to in place and also returned
    :param _ask: boolean, True when the triples come from an ASK query
    :return: a 2-tuple (path, topic_entities)
        (None, None)  : something went wrong (a warning is emitted)
        (-1, -1)      : out of scope (no topic entity in either triple)
        (path, [ * ]) : regular stuff
    """
    is_uri = nlutils.is_dbpedia_uri

    def expand(term):
        return nlutils.is_dbpedia_shorthand(term, _convert=True)

    # Step 1: the anchor is triple 0 unless it holds no entity at all, in which case triple 1 is tried.
    anchor_at = 0
    if not is_uri(_triples[0]['subject']) and not is_uri(_triples[0]['object']):
        anchor_at = 1
    anchor = _triples[anchor_at]

    # Step 2: first hop + the variable that links both triples.
    if is_uri(anchor['subject']):
        topic_entities = [expand(anchor['subject'])]
        first_variable = anchor['object']
        _path.append('+' + expand(anchor['predicate']))
    elif is_uri(anchor['object']):
        topic_entities = [expand(anchor['object'])]
        first_variable = anchor['subject']
        _path.append('-' + expand(anchor['predicate']))
    else:
        # Reached e.g. for
        #   WHERE { ?uri dbo:office 'President of the United States' .
        #           ?uri dbo:orderInOffice '16th' . }
        # Such queries are flagged as out of scope.
        warnings.warn("qald_parser.__fill_double_triple_data__: Apparently there is no topic entity in all the SPARQL "
                      + " query. Someone royally forked up. Dying now.")
        return -1, -1

    # Step 3: the other triple. Possible shapes:
    #   first_v p2 second_v | first_v p2 ent_2 | second_v p2 first_v | ent_2 p2 first_v
    # @TODO: verify if I have covered all bases here
    follower = _triples[1 - anchor_at]
    follower_has_entity = is_uri(follower['subject']) or is_uri(follower['object'])

    if _ask and not follower_has_entity:
        # Two variables in the second triple of an ASK query: should not have happened.
        warnings.warn("qald_parser.__fill_double_triple_data__: ASK queries can't have two variables.")
        return None, None

    # ASSUME THAT FIRST VARIABLE IS X *NOT* URI
    if follower['subject'] == first_variable:
        sign, far_end = '+', 'object'
    elif follower['object'] == first_variable:
        sign, far_end = '-', 'subject'
    else:
        # The second triple does not touch the connecting variable at all.
        if follower_has_entity:
            if _ask:
                complaint = "qald_parser.__fill_double_triple_data__: ASK query has a disjoint graph. WTF."
            else:
                complaint = ("qald_parser.__fill_double_triple_data__: Apparently there are two topic entities AND two "
                             + "entities in this SPARQL query. Someone royally forked up.")
        else:
            complaint = "qald_parser.__fill_double_triple_data__: Looks like an invalid SPARQL. Returning nones"
        warnings.warn(complaint)
        return None, None

    if follower_has_entity:
        topic_entities.append(expand(follower[far_end]))
    _path.append(sign + expand(follower['predicate']))

    return _path, topic_entities
def scavenge_entities(_sparql):
    """
    Function used to blindly find all the entities in the given parsed sparql object.
    To be called if the standard parsing fails to find anything.

    Only the triples of the first 'where' element are inspected. Every subject/object accepted by
    nlutils.is_dbpedia_uri is collected (converted through nlutils.is_dbpedia_shorthand), in triple
    order, subject before object. Duplicates are kept.

    :param _sparql: ze parsed SPARQL dict (from JSON)
    :return: list of str (entities), or -1 when the triples cannot be reached: a key is missing
        (KeyError) or the 'where' list is empty
    """
    try:
        triples = _sparql['where'][0]['triples']
    except (KeyError, IndexError):
        # IndexError: 'where' exists but is empty. Treated like a missing key instead of crashing.
        return -1

    entities = []
    try:
        # Go through all the triples
        for triple in triples:
            for role in ('subject', 'object'):
                term = triple[role]
                if nlutils.is_dbpedia_uri(term):
                    entities.append(nlutils.is_dbpedia_shorthand(term, _convert=True))
    except KeyError:
        return -1

    return entities
def get_true_path(sparql, raw_sparql):
    """
    Derive the true path, the topic entities and the constraints of one parsed query.

    Check if there is one or more triples (only the first 'where' element is looked at):
        1 Triple:
            not gonna be a rdf:type constraint. Get the sr/ro and make the path.
        2 Triples:
            if one of them is an rdf:type constraint, record it and treat the other as a single triple;
            otherwise chain both triples.
        3 Triples:
            handled ONLY if one of them is an rdf:type constraint; the rest is chained.
        anything else:
            out of scope.
    Also, if the raw query uses OPTIONAL / UNION / FILTER / ORDER, the question is out of scope.

    Side effect: for COUNT queries, sparql['variables'] is replaced in place by the counted variable.

    :param sparql: ze parsed SPARQL json
    :param raw_sparql: ze raw SPARQL string
    :return: a 3-tuple (path, entity, constraints)
        (None, None, None)                          : no triples could be found in the parsed query
        (-1, entities, {'out-of-scope': True})      : query is out of scope
        (path, entity, constraints)                 : otherwise; path may itself be None or -1 when the
                                                      triple helpers gave up, and entity falls back to
                                                      scavenge_entities() when it is None or -1.
        constraints may hold 'ask', 'count', '?uri' and '?x' keys.
    """
    constraints = {}
    entity = []
    path = []

    # For out of scope questions, root em out, return -1 for them.
    if _is_out_of_scope(raw_sparql):
        warnings.warn("qald_parser.get_true_path: The query is beyond the scope of this script")
        return -1, scavenge_entities(sparql), {'out-of-scope': True}

    # There may be no triples to start with: key missing, or an empty 'where' list.
    try:
        triples = sparql['where'][0]['triples']
    except (KeyError, IndexError):
        warnings.warn("qald_parser.get_true_path: Cannot find any triple to begin with.")
        return None, None, None

    # Detect and handle ASK questions differently.
    is_ask = sparql['queryType'].lower() == 'ask'
    if is_ask:
        constraints['ask'] = True

    # Detect and handle COUNT queries differently.
    # Plain variables are strings (TypeError on string indexing); ASK queries have no 'variables' key.
    try:
        aggregate = sparql['variables'][0]['expression']
        if aggregate['aggregation'] == 'count':
            constraints['count'] = True
            sparql['variables'] = [aggregate['expression']]
    except (TypeError, KeyError, IndexError):
        pass

    triple_count = len(triples)

    if triple_count == 1:
        path, entity = __fill_single_triple_data__(_triple=triples[0], _path=path, _ask=is_ask)

    elif triple_count == 2:
        typed = [triple for triple in triples if _is_type_triple(triple)]
        for triple in typed:
            _record_type_constraint(triple, sparql, constraints)

        if typed:
            # Only one triple carries real data. That can be taken care of easily.
            plain = [triple for triple in triples if not _is_type_triple(triple)]
            if plain:
                path, entity = __fill_single_triple_data__(_triple=plain[0], _path=path, _ask=is_ask)
        else:
            # Two triples, no rdf:type constraint: parse it the hard way.
            path, entity = __fill_double_triple_data__(_triples=triples, _path=path, _ask=is_ask)

    elif triple_count == 3:
        typed = [triple for triple in triples if _is_type_triple(triple)]
        if not typed:
            warnings.warn("No code in place for queries with three triples with *NO* rdf:type constraint")
            return -1, scavenge_entities(sparql), {'out-of-scope': True}

        # NOTE(review): with more than one rdf:type triple the chaining below runs once per type triple
        # on the same `path` (original behaviour, preserved) - confirm whether that case should instead
        # be out of scope.
        for triple in typed:
            _record_type_constraint(triple, sparql, constraints)
            # Pop it out of (a copy of) the list of triples and parse the rest.
            remaining = list(triples)
            remaining.remove(triple)
            path, entity = __fill_double_triple_data__(_triples=remaining, _path=path, _ask=is_ask)

    else:
        warnings.warn("No code in place for queries with more than three triples")
        return -1, scavenge_entities(sparql), {'out-of-scope': True}

    # Before returning, make sure the entity is usable; otherwise collect whatever entities exist.
    if entity is None or entity == -1:
        entity = scavenge_entities(sparql)

    return path, entity, constraints


# Predicates that mark a triple as an rdf:type constraint.
_RDF_TYPE_PREDICATES = ('a', 'rdf:type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type')

# Keywords whose presence in the raw query puts it out of scope.
_OUT_OF_SCOPE_KEYWORDS = ('optional', 'union', 'filter', 'order')


def _is_out_of_scope(raw_sparql):
    """Tell whether the raw query contains one of the out-of-scope keywords as a separate token."""
    # '(' is turned into a space as well, so that `FILTER(...)` written without a space is caught too.
    tokens = raw_sparql.lower().replace('{', ' ').replace('(', ' ').replace('.', '').split()
    return any(keyword in tokens for keyword in _OUT_OF_SCOPE_KEYWORDS)


def _is_type_triple(triple):
    """Tell whether the triple's predicate is one of the rdf:type spellings."""
    return triple['predicate'] in _RDF_TYPE_PREDICATES


def _record_type_constraint(triple, sparql, constraints):
    """Store the class of an rdf:type triple under '?uri' (projected variable) or '?x' (anything else)."""
    try:
        key = '?uri' if triple['subject'] in sparql['variables'] else '?x'
    except KeyError:
        # No 'variables' at all: a 2-level ASK query; the constraint can only be on the interim variable.
        key = '?x'
    constraints[key] = triple['object']
def run():
    """
    Compute get_true_path() for every question of the QALD test split.

    Reads the raw questions from RAW_QALD_DIR_TEST (JSON, under the 'questions' key) and the parsed
    SPARQL objects from PARSED_QALD_DIR_TEST (pickle). Both are expected to be aligned by index.
    When DEBUG is set, a separator and the raw SPARQL of each question are printed.

    :return: list with one get_true_path() result (path, entity, constraints) per question
    """
    # Load QALD
    with open(RAW_QALD_DIR_TEST) as raw_file:
        raw_dataset = json.load(raw_file)['questions']

    # Pickle streams are binary, hence 'rb'.
    # NOTE: unpickling can execute arbitrary code - only load files produced by a trusted source.
    with open(PARSED_QALD_DIR_TEST, 'rb') as parsed_file:
        parsed_dataset = pickle.load(parsed_file)

    paths = []

    # Iterate through every question
    for index, q_raw in enumerate(raw_dataset):
        # Indexing (rather than zip) keeps a too-short parsed dataset a loud IndexError.
        q_parsed = parsed_dataset[index]
        raw_sparql = q_raw['query']['sparql']

        if DEBUG:
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            print(raw_sparql)

        paths.append(get_true_path(q_parsed, raw_sparql))

    return paths
if __name__ == "__main__":
    # Compute the paths for the whole test split and persist them.
    paths = run()
    # Pickle writes bytes, so the target is opened in binary mode; `with` guarantees it is closed.
    with open('resources/paths', 'wb') as out_file:
        pickle.dump(paths, out_file)