| author | yzhou <yzhou@krr-linux.cs.ox.ac.uk> | 2015-04-30 17:36:35 +0100 |
|---|---|---|
| committer | yzhou <yzhou@krr-linux.cs.ox.ac.uk> | 2015-04-30 17:36:35 +0100 |
| commit | 0d8f240c9c0a64f2285324e5a517161e45c698fc (patch) | |
| tree | f4b4f7078e3be02011b9812cd8791c657a135993 /external/uk/ac/ox/cs/data/dbpedia | |
| parent | 68ae342b2a4923bc7b3f378c6a489f2355d85279 (diff) | |
Downgrade OWL API and reorganise src files
Diffstat (limited to 'external/uk/ac/ox/cs/data/dbpedia')
| -rw-r--r-- | external/uk/ac/ox/cs/data/dbpedia/DataFilter.java | 68 |
| -rw-r--r-- | external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java | 116 |
| -rw-r--r-- | external/uk/ac/ox/cs/data/dbpedia/Normaliser.java | 155 |
3 files changed, 339 insertions, 0 deletions
diff --git a/external/uk/ac/ox/cs/data/dbpedia/DataFilter.java b/external/uk/ac/ox/cs/data/dbpedia/DataFilter.java
new file mode 100644
index 0000000..dc2f3e0
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/dbpedia/DataFilter.java
@@ -0,0 +1,68 @@
package uk.ac.ox.cs.data.dbpedia;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.turtle.TurtleParser;
import org.openrdf.rio.turtle.TurtleWriter;
import org.semanticweb.owlapi.model.OWLAnnotationProperty;
import org.semanticweb.owlapi.model.OWLDataProperty;
import org.semanticweb.owlapi.model.OWLOntology;
import uk.ac.ox.cs.pagoda.owl.OWLHelper;

public class DataFilter {

    public static void main(String[] args) throws FileNotFoundException {
        filteringDBPedia();
    }

    /**
     * Filter out data property assertions and annotation property assertions in the data set.
     *
     * @throws FileNotFoundException
     */
    private static void filteringDBPedia() throws FileNotFoundException {
        String[] args = (
//                "/home/yzhou/ontologies/npd/npd-all.owl " +
//                "/home/yzhou/ontologies/npd/data/npd-data-dump-processed.ttl " +
//                "/home/yzhou/ontologies/npd/data/npd-data-dump-minus-datatype-new.ttl " +
//                "http://sws.ifi.uio.no/vocab/npd-all.owl#"

                "/media/RDFData/yzhou/dbpedia/integratedOntology.owl " +
                "/media/RDFData/yzhou/dbpedia/data/dbpedia-processed.ttl " +
                "/home/yzhou/ontologies/dbpedia/data/dbpedia-minus-datatype-new.ttl " +
                "http://dbpedia.org/ontology/"
        ).split("\\ ");


        OWLOntology ontology = OWLHelper.loadOntology(args[0]);

        Set<String> properties2ignore = new HashSet<String>();
        for (OWLDataProperty prop: ontology.getDataPropertiesInSignature(true))
            properties2ignore.add(prop.toStringID());
        for (OWLAnnotationProperty prop: ontology.getAnnotationPropertiesInSignature())
            properties2ignore.add(prop.toStringID());

        TurtleParser parser = new TurtleParser();
        TurtleWriter writer = new TurtleWriter(new FileOutputStream(args[2]));

        parser.setRDFHandler(new DataFilterRDFHandler(writer, properties2ignore));
        try {
            parser.parse(new FileInputStream(args[1]), args[3]);
        } catch (RDFParseException e) {
            e.printStackTrace();
        } catch (RDFHandlerException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

}
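For orientation, the class above drives the OpenRDF Turtle parser with the custom handler defined in the next file, using a hard-coded, space-split argument string. A minimal sketch of the same parse-filter-write pipeline with ordinary command-line arguments instead; the class name and argument order here are assumptions for illustration, not part of the commit:

```java
package uk.ac.ox.cs.data.dbpedia;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.HashSet;
import java.util.Set;

import org.openrdf.rio.turtle.TurtleParser;
import org.openrdf.rio.turtle.TurtleWriter;
import org.semanticweb.owlapi.model.OWLAnnotationProperty;
import org.semanticweb.owlapi.model.OWLDataProperty;
import org.semanticweb.owlapi.model.OWLOntology;

import uk.ac.ox.cs.pagoda.owl.OWLHelper;

// Hypothetical variant of DataFilter.filteringDBPedia() that takes its inputs
// from the command line rather than a hard-coded string.
public class DataFilterCli {

    public static void main(String[] args) throws Exception {
        // args[0] = ontology file, args[1] = input Turtle dump,
        // args[2] = filtered output file, args[3] = base IRI used when parsing
        OWLOntology ontology = OWLHelper.loadOntology(args[0]);

        // Collect the IRIs of all data and annotation properties; assertions on them are dropped.
        Set<String> properties2ignore = new HashSet<String>();
        for (OWLDataProperty prop : ontology.getDataPropertiesInSignature(true))
            properties2ignore.add(prop.toStringID());
        for (OWLAnnotationProperty prop : ontology.getAnnotationPropertiesInSignature())
            properties2ignore.add(prop.toStringID());

        // Stream the dump through the filtering handler and write the surviving triples.
        TurtleParser parser = new TurtleParser();
        TurtleWriter writer = new TurtleWriter(new FileOutputStream(args[2]));
        parser.setRDFHandler(new DataFilterRDFHandler(writer, properties2ignore));
        parser.parse(new FileInputStream(args[1]), args[3]);
    }
}
```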
diff --git a/external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java b/external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java
new file mode 100644
index 0000000..6dbac91
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java
@@ -0,0 +1,116 @@
package uk.ac.ox.cs.data.dbpedia;

import java.text.Normalizer;
import java.util.Set;

import org.apache.jena.iri.IRI;
import org.apache.jena.iri.IRIException;
import org.apache.jena.iri.IRIFactory;

import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.Literal;
import org.openrdf.model.Statement;
import org.openrdf.model.impl.StatementImpl;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFWriter;

public class DataFilterRDFHandler implements RDFHandler {

    public static IRIFactory iriFactory = IRIFactory.semanticWebImplementation();

    RDFWriter m_writer;
    Set<String> m_properties;

    public DataFilterRDFHandler(RDFWriter writer, Set<String> properties2ignore) {
        m_writer = writer;
        m_properties = properties2ignore;
    }

    @Override
    public void endRDF() throws RDFHandlerException {
        m_writer.endRDF();
    }

    @Override
    public void handleComment(String arg0) throws RDFHandlerException {
        m_writer.handleComment(arg0);
    }

    @Override
    public void handleNamespace(String arg0, String arg1) throws RDFHandlerException {
        m_writer.handleNamespace(arg0, arg1);
    }

    @Override
    public void handleStatement(Statement arg0) throws RDFHandlerException {
        Value newObject = null, oldObject = arg0.getObject();

        if (oldObject instanceof Literal)
            return ;
        else if (oldObject instanceof BNode) {
            newObject = oldObject;
        }
        else if (oldObject instanceof URI)
            newObject = new URIImpl(Normalizer.normalize(oldObject.toString(), Normalizer.Form.NFKC));
        else {
            System.out.println("Object: " + oldObject.getClass());
        }

        String predicate = arg0.getPredicate().toString();
        if (m_properties.contains(predicate)) return ;

        Resource newSubject = null, oldSubject = arg0.getSubject();

        if (oldSubject instanceof BNode) {
            newSubject = oldSubject;
        }
        else if (oldSubject instanceof URI) {
            newSubject = new URIImpl(Normalizer.normalize(oldSubject.toString(), Normalizer.Form.NFKC));
        }
        else {
            System.out.println("Subject: " + oldSubject.getClass());
        }

//        if (newObject.toString().contains("ns#type"))
//            System.out.println(arg0);

        if (newSubject == null || newObject == null) {
            System.out.println(arg0);
            return ;
        }

        IRI subjectIRI, objectIRI;
        try {
            if (newSubject instanceof URI){
                subjectIRI = iriFactory.construct(newSubject.toString());
                if (subjectIRI.hasViolation(true)) {
                    System.out.println(arg0);
                    return ;
                }
            }
            if (newObject instanceof URI) {
                objectIRI = iriFactory.construct(newObject.toString());
                if (objectIRI.hasViolation(true)) {
                    System.out.println(arg0);
                    return ;
                }
            }

        } catch (IRIException e) {
            return ;
        }

        m_writer.handleStatement(new StatementImpl(newSubject, arg0.getPredicate(), newObject));
    }

    @Override
    public void startRDF() throws RDFHandlerException {
        m_writer.startRDF();
    }

}
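The handler keeps only object-property-style triples: literal objects are skipped, predicates in the ignore set are skipped, URIs are NFKC-normalised, and any subject or object whose IRI fails Jena's semantic-web IRI check is discarded. A small standalone sketch of those two normalisation and validation calls; the class name and sample string are hypothetical, not part of the commit:

```java
import java.text.Normalizer;

import org.apache.jena.iri.IRI;
import org.apache.jena.iri.IRIException;
import org.apache.jena.iri.IRIFactory;

// Hypothetical standalone check mirroring what handleStatement does per URI.
public class IriCheckSketch {

    public static void main(String[] args) {
        IRIFactory iriFactory = IRIFactory.semanticWebImplementation();

        // NFKC folds compatibility characters (e.g. full-width letters) into canonical form,
        // as the handler does before rewriting a subject or object URI.
        String raw = "http://dbpedia.org/resource/Ｅｘａｍｐｌｅ";   // hypothetical full-width IRI
        String normalised = Normalizer.normalize(raw, Normalizer.Form.NFKC);

        try {
            IRI iri = iriFactory.construct(normalised);
            // hasViolation(true) also counts warnings, so such triples are skipped by the handler.
            System.out.println(normalised + " has violations: " + iri.hasViolation(true));
        } catch (IRIException e) {
            // Unparseable IRIs are silently dropped, as in handleStatement.
            System.out.println("dropped: " + normalised);
        }
    }
}
```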
diff --git a/external/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/external/uk/ac/ox/cs/data/dbpedia/Normaliser.java
new file mode 100644
index 0000000..e025604
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/dbpedia/Normaliser.java
@@ -0,0 +1,155 @@
package uk.ac.ox.cs.data.dbpedia;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

public class Normaliser {

    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            args = new String[] {
                "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl",
                "1"
            };
        }

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));
        String fragment = args[0];
        int size = Integer.valueOf(args[1]), index;

        if ((index = fragment.lastIndexOf(".")) != -1) {
            fragment = fragment.substring(0, index) + "_new_fragment" + args[1] + fragment.substring(index);
        }
        else fragment += "_fragment" + args[1];

        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment)));

//        simpleProcess(reader, writer, size);
        process(reader, writer, size);

        writer.close();
        reader.close();
    }

    public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        String line;
        int index = 0;
        while ((line = reader.readLine()) != null) {
            if (++index == size) {
                index = 0;
                writer.write(line);
                writer.newLine();
            }
        }
    }

    static final String illegalSymbols = ",()'‘";
    static final String[][] replacedSymbols = new String[][] {
        {"æ", "ae"},
        {"ø", "o"},
        {"ß", "t"},
        {"Ł", "L"},
        {"ı", "i"},
        {"ł", "l"},
        {"–", "-"},
        {"&", "and"},
        {"ð", "o"},
        {"ə", "e"},
        {"Đ", "D"},
        {"ħ", "h"},
//        {"%60", "_"},
        {"đ", "d"},
        {"Þ", "P"}
    };

    static Set<Character> symbols2remove;
    static Map<Character, String> symbols2replace;

    static {
        symbols2remove = new HashSet<Character>();
        for (int i = 0; i < illegalSymbols.length(); ++i)
            symbols2remove.add(illegalSymbols.charAt(i));

        symbols2replace = new HashMap<Character, String>();
        for (int i = 0; i < replacedSymbols.length; ++i)
            symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]);
    }

    static final String urlSymbols = "http://";
    static final int urlSymbolLength = 7;

    public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        int index = 0;
        String line;

        String newLine;
        while ((line = reader.readLine()) != null) {
            if (line.contains("@"))
                continue;

            if (++index == size) {
                newLine = process(line);
                writer.write(deAccent(newLine.toString()));
                writer.write('.');
                writer.newLine();
                index = 0;
            }
        }

        writer.close();
        reader.close();
    }

    private static String process(String line) {
        line = line.replace("%60", "_");//.replace("__", "_");

        int inURL = 0;
        char ch;
        String str;
        StringBuilder newLine = new StringBuilder();
        for (int i = 0; i < line.length(); ++i) {
            ch = line.charAt(i);

            if (ch == '.') {
                if (inURL == urlSymbolLength)
                    newLine.append('.');
                continue;
            }

            if (inURL == urlSymbolLength) {
                if (ch == '/' || ch == '#' || ch == ')' || ch == '>') inURL = 0;
            }
            else if (ch == urlSymbols.charAt(inURL)) {
                ++inURL;
            }
            else inURL = 0;

            if ((str = symbols2replace.get(ch)) != null)
                newLine.append(str);
            else if (!symbols2remove.contains(ch))
                newLine.append(ch);
        }

        return newLine.toString();
    }

    public static String deAccent(String str) {
        String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
        Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
        String t = pattern.matcher(nfdNormalizedString).replaceAll("");
        return t;
    }


}
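Normaliser samples every size-th non-prefix line of a Turtle dump, transliterates or strips the symbols listed above, and removes combining diacritics via deAccent. Both entry points are public, so a hedged usage sketch could look like the following; the driver class name and file path are hypothetical, not part of the commit:

```java
import java.io.IOException;

import uk.ac.ox.cs.data.dbpedia.Normaliser;

// Hypothetical driver showing both ways the class can be used.
public class NormaliserUsage {

    public static void main(String[] args) throws IOException {
        // Programmatic helper call: NFD-decompose, then strip combining diacritical marks.
        System.out.println(Normaliser.deAccent("Ångström"));   // prints "Angstrom"

        // Command-line style invocation: keep every data line (size = 1) of the dump;
        // the output name gets "_new_fragment1" inserted before the file extension.
        Normaliser.main(new String[] { "/path/to/dbpedia-minus-datatype.ttl", "1" });
    }
}
```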