aboutsummaryrefslogtreecommitdiff
path: root/external/uk/ac/ox/cs/data/dbpedia
diff options
context:
space:
mode:
author yzhou <yzhou@krr-linux.cs.ox.ac.uk> 2015-04-30 17:36:35 +0100
committer yzhou <yzhou@krr-linux.cs.ox.ac.uk> 2015-04-30 17:36:35 +0100
commit 0d8f240c9c0a64f2285324e5a517161e45c698fc (patch)
tree f4b4f7078e3be02011b9812cd8791c657a135993 /external/uk/ac/ox/cs/data/dbpedia
parent 68ae342b2a4923bc7b3f378c6a489f2355d85279 (diff)
download ACQuA-0d8f240c9c0a64f2285324e5a517161e45c698fc.tar.gz
ACQuA-0d8f240c9c0a64f2285324e5a517161e45c698fc.zip
downgrade owl api and reorganised src files
Diffstat (limited to 'external/uk/ac/ox/cs/data/dbpedia')
-rw-r--r-- external/uk/ac/ox/cs/data/dbpedia/DataFilter.java 68
-rw-r--r-- external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java 116
-rw-r--r-- external/uk/ac/ox/cs/data/dbpedia/Normaliser.java 155
3 files changed, 339 insertions, 0 deletions
diff --git a/external/uk/ac/ox/cs/data/dbpedia/DataFilter.java b/external/uk/ac/ox/cs/data/dbpedia/DataFilter.java
new file mode 100644
index 0000000..dc2f3e0
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/dbpedia/DataFilter.java
@@ -0,0 +1,68 @@
1package uk.ac.ox.cs.data.dbpedia;
2
3import java.io.FileInputStream;
4import java.io.FileNotFoundException;
5import java.io.FileOutputStream;
6import java.io.IOException;
7import java.util.HashSet;
8import java.util.Set;
9
10import org.openrdf.rio.RDFHandlerException;
11import org.openrdf.rio.RDFParseException;
12import org.openrdf.rio.turtle.TurtleParser;
13import org.openrdf.rio.turtle.TurtleWriter;
14import org.semanticweb.owlapi.model.OWLAnnotationProperty;
15import org.semanticweb.owlapi.model.OWLDataProperty;
16import org.semanticweb.owlapi.model.OWLOntology;
17import uk.ac.ox.cs.pagoda.owl.OWLHelper;
18
19public class DataFilter {
20
21 public static void main(String[] args) throws FileNotFoundException {
22 filteringDBPedia();
23 }
24
25 /**
26 * Filter out data property assertions and annotation property assertions in the data set.
27 *
28 * @throws FileNotFoundException
29 */
30 private static void filteringDBPedia() throws FileNotFoundException {
31 String[] args = (
32// "/home/yzhou/ontologies/npd/npd-all.owl " +
33// "/home/yzhou/ontologies/npd/data/npd-data-dump-processed.ttl " +
34// "/home/yzhou/ontologies/npd/data/npd-data-dump-minus-datatype-new.ttl " +
35// "http://sws.ifi.uio.no/vocab/npd-all.owl#"
36
37 "/media/RDFData/yzhou/dbpedia/integratedOntology.owl " +
38 "/media/RDFData/yzhou/dbpedia/data/dbpedia-processed.ttl " +
39 "/home/yzhou/ontologies/dbpedia/data/dbpedia-minus-datatype-new.ttl " +
40 "http://dbpedia.org/ontology/"
41 ).split("\\ ");
42
43
44 OWLOntology ontology = OWLHelper.loadOntology(args[0]);
45
46 Set<String> properties2ignore = new HashSet<String>();
47 for (OWLDataProperty prop: ontology.getDataPropertiesInSignature(true))
48 properties2ignore.add(prop.toStringID());
49 for (OWLAnnotationProperty prop: ontology.getAnnotationPropertiesInSignature())
50 properties2ignore.add(prop.toStringID());
51
52 TurtleParser parser = new TurtleParser();
53 TurtleWriter writer = new TurtleWriter(new FileOutputStream(args[2]));
54
55 parser.setRDFHandler(new DataFilterRDFHandler(writer, properties2ignore));
56 try {
57 parser.parse(new FileInputStream(args[1]), args[3]);
58 } catch (RDFParseException e) {
59 e.printStackTrace();
60 } catch (RDFHandlerException e) {
61 e.printStackTrace();
62 } catch (IOException e) {
63 e.printStackTrace();
64 }
65
66 }
67
68}
diff --git a/external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java b/external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java
new file mode 100644
index 0000000..6dbac91
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java
@@ -0,0 +1,116 @@
1package uk.ac.ox.cs.data.dbpedia;
2
3import java.text.Normalizer;
4import java.util.Set;
5
6import org.apache.jena.iri.IRI;
7import org.apache.jena.iri.IRIException;
8import org.apache.jena.iri.IRIFactory;
9
10import org.openrdf.model.BNode;
11import org.openrdf.model.Resource;
12import org.openrdf.model.URI;
13import org.openrdf.model.Value;
14import org.openrdf.model.Literal;
15import org.openrdf.model.Statement;
16import org.openrdf.model.impl.StatementImpl;
17import org.openrdf.model.impl.URIImpl;
18import org.openrdf.rio.RDFHandler;
19import org.openrdf.rio.RDFHandlerException;
20import org.openrdf.rio.RDFWriter;
21
22public class DataFilterRDFHandler implements RDFHandler {
23
24 public static IRIFactory iriFactory = IRIFactory.semanticWebImplementation();
25
26 RDFWriter m_writer;
27 Set<String> m_properties;
28
29 public DataFilterRDFHandler(RDFWriter writer, Set<String> properties2ignore) {
30 m_writer = writer;
31 m_properties = properties2ignore;
32 }
33
34 @Override
35 public void endRDF() throws RDFHandlerException {
36 m_writer.endRDF();
37 }
38
39 @Override
40 public void handleComment(String arg0) throws RDFHandlerException {
41 m_writer.handleComment(arg0);
42 }
43
44 @Override
45 public void handleNamespace(String arg0, String arg1) throws RDFHandlerException {
46 m_writer.handleNamespace(arg0, arg1);
47 }
48
49 @Override
50 public void handleStatement(Statement arg0) throws RDFHandlerException {
51 Value newObject = null, oldObject = arg0.getObject();
52
53 if (oldObject instanceof Literal)
54 return ;
55 else if (oldObject instanceof BNode) {
56 newObject = oldObject;
57 }
58 else if (oldObject instanceof URI)
59 newObject = new URIImpl(Normalizer.normalize(oldObject.toString(), Normalizer.Form.NFKC));
60 else {
61 System.out.println("Object: " + oldObject.getClass());
62 }
63
64 String predicate = arg0.getPredicate().toString();
65 if (m_properties.contains(predicate)) return ;
66
67 Resource newSubject = null, oldSubject = arg0.getSubject();
68
69 if (oldSubject instanceof BNode) {
70 newSubject = oldSubject;
71 }
72 else if (oldSubject instanceof URI) {
73 newSubject = new URIImpl(Normalizer.normalize(oldSubject.toString(), Normalizer.Form.NFKC));
74 }
75 else {
76 System.out.println("Subject: " + oldSubject.getClass());
77 }
78
79// if (newObject.toString().contains("ns#type"))
80// System.out.println(arg0);
81
82 if (newSubject == null || newObject == null) {
83 System.out.println(arg0);
84 return ;
85 }
86
87 IRI subjectIRI, objectIRI;
88 try {
89 if (newSubject instanceof URI){
90 subjectIRI = iriFactory.construct(newSubject.toString());
91 if (subjectIRI.hasViolation(true)) {
92 System.out.println(arg0);
93 return ;
94 }
95 }
96 if (newObject instanceof URI) {
97 objectIRI = iriFactory.construct(newObject.toString());
98 if (objectIRI.hasViolation(true)) {
99 System.out.println(arg0);
100 return ;
101 }
102 }
103
104 } catch (IRIException e) {
105 return ;
106 }
107
108 m_writer.handleStatement(new StatementImpl(newSubject, arg0.getPredicate(), newObject));
109 }
110
111 @Override
112 public void startRDF() throws RDFHandlerException {
113 m_writer.startRDF();
114 }
115
116}
diff --git a/external/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/external/uk/ac/ox/cs/data/dbpedia/Normaliser.java
new file mode 100644
index 0000000..e025604
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/dbpedia/Normaliser.java
@@ -0,0 +1,155 @@
1package uk.ac.ox.cs.data.dbpedia;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.FileInputStream;
6import java.io.FileOutputStream;
7import java.io.IOException;
8import java.io.InputStreamReader;
9import java.io.OutputStreamWriter;
10import java.text.Normalizer;
11import java.util.HashMap;
12import java.util.HashSet;
13import java.util.Map;
14import java.util.Set;
15import java.util.regex.Pattern;
16
/**
 * Line-based clean-up and sampling of a triple file: keeps every {@code size}-th line,
 * strips or replaces characters that are awkward inside IRIs, removes diacritics, and
 * terminates each emitted line with a single {@code '.'}.
 */
public class Normaliser {

	/**
	 * Encoding used for both the input and the output file. The default input is a
	 * {@code .ttl} file, and the W3C Turtle specification defines Turtle documents as UTF-8.
	 */
	private static final String ENCODING = "UTF-8";

	/** Combining marks left behind after NFD decomposition; compiled once and reused. */
	private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

	/**
	 * Reads {@code args[0]}, keeps every {@code args[1]}-th line and writes the processed
	 * result next to the input. The output name inserts {@code _new_fragment<N>} before the
	 * extension, or appends {@code _fragment<N>} when the name has no dot.
	 * With no arguments a built-in default file and interval are used.
	 *
	 * @param args input file path and sampling interval
	 * @throws IOException              if reading or writing fails
	 * @throws IllegalArgumentException if exactly one argument is supplied
	 * @throws NumberFormatException    if the interval is not an integer
	 */
	public static void main(String[] args) throws IOException {
		if (args.length == 0) {
			args = new String[] {
					"/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl",
					"1"
			};
		}
		if (args.length < 2)
			throw new IllegalArgumentException("usage: Normaliser <input file> <sampling interval>");

		String fragment = args[0];
		int size = Integer.parseInt(args[1]), index;

		if ((index = fragment.lastIndexOf(".")) != -1) {
			fragment = fragment.substring(0, index) + "_new_fragment" + args[1] + fragment.substring(index);
		}
		else fragment += "_fragment" + args[1];

		BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), ENCODING));
		try {
			BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment), ENCODING));
			try {
//				simpleProcess(reader, writer, size);
				process(reader, writer, size);
			} finally {
				// Closing an already closed buffered stream is a no-op, so this is safe after process().
				writer.close();
			}
		} finally {
			reader.close();
		}
	}

	/**
	 * Copies every {@code size}-th line of {@code reader} to {@code writer} unchanged.
	 * Neither stream is flushed or closed.
	 *
	 * @param reader source of lines
	 * @param writer destination
	 * @param size   sampling interval; values below 1 select no line
	 * @throws IOException if reading or writing fails
	 */
	public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
		String line;
		int index = 0;
		while ((line = reader.readLine()) != null) {
			if (++index == size) {
				index = 0;
				writer.write(line);
				writer.newLine();
			}
		}
	}

	/** Characters that are deleted from every processed line. */
	static final String illegalSymbols = ",()'‘";

	/**
	 * Single characters (first column) and the text that replaces them (second column).
	 * NOTE(review): some targets are unusual transliterations (e.g. "ß" to "t", "ð" to "o",
	 * "Þ" to "P") - confirm they are intended before relying on them.
	 */
	static final String[][] replacedSymbols = new String[][] {
		{"æ", "ae"},
		{"ø", "o"},
		{"ß", "t"},
		{"Ł", "L"},
		{"ı", "i"},
		{"ł", "l"},
		{"–", "-"},
		{"&", "and"},
		{"ð", "o"},
		{"ə", "e"},
		{"Đ", "D"},
		{"ħ", "h"},
//		{"%60", "_"},
		{"đ", "d"},
		{"Þ", "P"}
	};

	/** Lookup form of {@link #illegalSymbols}. */
	static Set<Character> symbols2remove;
	/** Lookup form of {@link #replacedSymbols}, keyed by the first character of each source string. */
	static Map<Character, String> symbols2replace;

	static {
		symbols2remove = new HashSet<Character>();
		for (int i = 0; i < illegalSymbols.length(); ++i)
			symbols2remove.add(illegalSymbols.charAt(i));

		symbols2replace = new HashMap<Character, String>();
		for (int i = 0; i < replacedSymbols.length; ++i)
			symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]);
	}

	/** Prefix that marks the start of a URL inside a line. */
	static final String urlSymbols = "http://";
	/** Length of {@link #urlSymbols}; a match counter equal to this means "inside the host part". */
	static final int urlSymbolLength = 7;

	/**
	 * Samples and normalises a stream of lines. Lines containing {@code '@'} are skipped
	 * and do not count towards the sampling interval. Each kept line is cleaned by
	 * {@link #process(String)}, stripped of diacritics, and written followed by {@code '.'}
	 * and a line separator. Both streams are closed before returning, on failure as well.
	 *
	 * @param reader source of lines
	 * @param writer destination
	 * @param size   sampling interval; values below 1 select no line
	 * @throws IOException if reading or writing fails
	 */
	public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
		int index = 0;
		String line;

		try {
			while ((line = reader.readLine()) != null) {
				if (line.contains("@"))
					continue;

				if (++index == size) {
					writer.write(deAccent(process(line)));
					writer.write('.');
					writer.newLine();
					index = 0;
				}
			}
		} finally {
			try {
				writer.close();
			} finally {
				reader.close();
			}
		}
	}

	/**
	 * Cleans one line: turns {@code %60} into {@code _}, applies the removal and replacement
	 * tables, and drops every {@code '.'} except those between an {@code http://} prefix and
	 * the next {@code '/'}, {@code '#'}, {@code ')'} or {@code '>'}.
	 *
	 * @param line raw input line
	 * @return the cleaned line, without a trailing statement terminator
	 */
	private static String process(String line) {
		line = line.replace("%60", "_");//.replace("__", "_");

		// Number of leading characters of urlSymbols matched so far.
		int inURL = 0;
		StringBuilder newLine = new StringBuilder(line.length());
		for (int i = 0; i < line.length(); ++i) {
			char ch = line.charAt(i);

			if (ch == '.') {
				if (inURL == urlSymbolLength)
					newLine.append('.');
				else
					inURL = 0;	// a dot interrupts a partially matched prefix
				continue;
			}

			if (inURL == urlSymbolLength) {
				if (ch == '/' || ch == '#' || ch == ')' || ch == '>') inURL = 0;
			}
			else if (ch == urlSymbols.charAt(inURL)) {
				++inURL;
			}
			else {
				// The mismatching character may itself start a new prefix (e.g. "hhttp://").
				inURL = (ch == urlSymbols.charAt(0)) ? 1 : 0;
			}

			String str = symbols2replace.get(ch);
			if (str != null)
				newLine.append(str);
			else if (!symbols2remove.contains(ch))
				newLine.append(ch);
		}

		return newLine.toString();
	}

	/**
	 * Removes diacritics by decomposing to NFD and deleting the combining marks.
	 *
	 * @param str text to strip
	 * @return {@code str} in NFD form with combining diacritical marks removed
	 */
	public static String deAccent(String str) {
		String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
		return DIACRITICS.matcher(nfdNormalizedString).replaceAll("");
	}


}