aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorRncLsn <rnc.lsn@gmail.com>2015-06-04 14:00:20 +0100
committerRncLsn <rnc.lsn@gmail.com>2015-06-04 14:00:20 +0100
commitb3ce74df783ebe665182dbd916a7288cff8bc127 (patch)
treebf40a35615a3148f68e4d09fe72e693b7ec8b8fc /scripts
parent8a5d02f50e96d531a867e79607e7d283a756a4ec (diff)
downloadACQuA-b3ce74df783ebe665182dbd916a7288cff8bc127.tar.gz
ACQuA-b3ce74df783ebe665182dbd916a7288cff8bc127.zip
SyGENiA query converter.
Diffstat (limited to 'scripts')
-rw-r--r--scripts/SyGENiA_query_converter.py91
1 files changed, 91 insertions, 0 deletions
diff --git a/scripts/SyGENiA_query_converter.py b/scripts/SyGENiA_query_converter.py
new file mode 100644
index 0000000..a40c05f
--- /dev/null
+++ b/scripts/SyGENiA_query_converter.py
@@ -0,0 +1,91 @@
1import sys
2import os
3from os.path import join
4import re
5import argparse
6import random
7
8
9# example query
10# Q(?0)<-takesCourse(?0,?1), Course(?1)
11
12# example namespace_pairs
13# unibench:http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl
14
15
# Maps the numeric index of a query variable (the digits after '?')
# to the letter used for it in the generated SPARQL.
var_map = {'0': 'x', '1': 'y', '2': 'z', '3': 'u', '4': 'v', '5': 'w'}

# Matches one body atom such as `Course(?1)` or `takesCourse(?0,?1)`:
# group 'name' is the predicate name (letters only), group 'vars' is the
# comma-separated list of numeric variables.
# Raw string so that `\(`, `\?` and `\)` reach the regex engine unchanged
# instead of relying on Python passing unknown string escapes through.
atom_regex = r'(?P<name>[A-Za-z]+)\(((?P<vars>\?[0-9]+(,\?[0-9]+)*))\)'
18
19
def get_var(numeric_var, answer_vars, blank_pct=0):
    """Turn a numeric variable into a named SPARQL term.

    The text after the first character of `numeric_var` (e.g. the `1` of
    `?1`) is looked up in `var_map`. The result is the corresponding
    variable (e.g. `?y`), or, with a chance of `blank_pct` percent and only
    when that variable is not one of `answer_vars`, a blank node
    (e.g. `_:y`).
    """
    # Draw first, exactly one random number per call.
    draw = random.random()
    threshold = float(blank_pct) / 100

    letter = var_map[numeric_var[1:]]
    named_var = '?' + letter

    # Answer variables must stay real variables, never blank nodes.
    if not (draw < threshold) or named_var in answer_vars:
        return named_var
    return '_:' + letter
32
33
def parse_query(query, namespace_pair, query_id, blank_pct=0):
    """Translate a query from FOL notation to SPARQL.

    query          -- text such as `Q(?0)<-takesCourse(?0,?1), Course(?1)`
    namespace_pair -- `<id>:<namespace>`, split at the first colon
    query_id       -- integer written in the `^[query<N>]` header line
    blank_pct      -- percentage passed on to get_var for replacing
                      non-answer variables with blank nodes

    Returns the SPARQL query text. Unary atoms become `rdf:type` triples,
    binary atoms become property triples.

    Raises ValueError if `namespace_pair` contains no colon and IOError if
    an atom has more than two arguments.
    """

    namespace_id, colon, namespace = namespace_pair.partition(':')
    if not colon:
        raise ValueError('namespace must be of the form <id>:<namespace>, got %r'
                         % namespace_pair)
    # A SPARQL PREFIX declaration needs the IRI enclosed in angle brackets;
    # add them unless the caller already did.
    if not namespace.startswith('<'):
        namespace = '<' + namespace + '>'

    head, body = query.split('<-')
    # Built as a list (not a lazy map) because it is searched once per body
    # variable in get_var and then joined again for the SELECT line.
    answer_vars = ['?' + var_map[v.strip()[1:]]
                   for v in head.strip()[2:-1].split(',')]

    triples = []
    var_cache = {}  # keeps the variable/blank-node choice stable per variable
    for match in re.finditer(atom_regex, body):
        name = match.group('name')

        atom_vars = []
        for x in match.group('vars').split(','):
            if x not in var_cache:
                var_cache[x] = get_var(x, answer_vars, blank_pct)
            atom_vars.append(var_cache[x])

        if len(atom_vars) == 1:
            triples.append((atom_vars[0], 'rdf:type', namespace_id + ':' + name))
        elif len(atom_vars) == 2:
            triples.append((atom_vars[0], namespace_id + ':' + name, atom_vars[1]))
        else:
            raise IOError('Predicate of arity > 2')

    lines = [
        '^[query%d]' % query_id,
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>',
        'PREFIX ' + namespace_id + ': ' + namespace,
        'SELECT ' + ' '.join(answer_vars),
        'WHERE {',
    ]
    patterns = [' ' + ' '.join(triple) for triple in triples]
    if patterns:
        # Triples are separated by ' .'; the last one carries no dot, only
        # the trailing space this function has always emitted there.
        lines.append(' .\n'.join(patterns) + ' ')
    # With no triples the block is simply 'WHERE {' followed by '}'.
    lines.append('}')

    return '\n'.join(lines)
71
72
if __name__ == '__main__':
    # Command-line entry point: converts every query file found in the input
    # directory (or the single input file) and prints the SPARQL queries to
    # standard output, numbering them from 1.

    parser = argparse.ArgumentParser(description='Convert queries from FOL notation to SPARQL.')
    parser.add_argument('-b', '--blank', default='0',
                        help='percentage of vars to be randomly replaced with blank nodes')
    parser.add_argument('namespace',
                        help='<id>:<namespace>, that is a colon-separated pair with an id and the namespace for all the individuals in the query')
    parser.add_argument('input', help='<input-dir> or <input-file>, that is an input directory or a single file')
    args = parser.parse_args()

    if os.path.isfile(args.input):
        # The help text allows a single file; os.listdir would fail on it.
        input_paths = [args.input]
    else:
        # os.listdir returns entries in arbitrary order; sort them so the
        # query ids are assigned reproducibly. Sub-directories are skipped.
        input_paths = [join(args.input, entry) for entry in sorted(os.listdir(args.input))]
        input_paths = [path for path in input_paths if os.path.isfile(path)]

    for query_id, input_path in enumerate(input_paths, 1):
        with open(input_path, 'r') as in_file:
            query = in_file.read()
        parsed_query = parse_query(query, args.namespace, query_id, args.blank)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(parsed_query)
        print('')