package uk.ac.ox.cs.data.dbpedia;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.turtle.TurtleParser;
import org.openrdf.rio.turtle.TurtleWriter;
import org.semanticweb.owlapi.model.OWLAnnotationProperty;
import org.semanticweb.owlapi.model.OWLDataProperty;
import org.semanticweb.owlapi.model.OWLOntology;

import uk.ac.ox.cs.pagoda.owl.OWLHelper;

/**
 * Command-line utility that filters data-property and annotation-property
 * assertions out of a Turtle data dump.
 *
 * The properties to drop are collected from the signature of an OWL ontology;
 * the actual per-statement filtering is done by {@link DataFilterRDFHandler}.
 */
public class DataFilter {

    public static void main(String[] args) throws FileNotFoundException {
        filteringDBPedia();
    }

    /**
     * Filters the DBPedia dump. Arguments are hard-coded (in order):
     * ontology file, input .ttl, output .ttl, base IRI used for parsing.
     *
     * @throws FileNotFoundException declared for signature compatibility;
     *         missing files are now also reported via the IOException catch below
     */
    private static void filteringDBPedia() throws FileNotFoundException {
        String[] args = (
                "/media/RDFData/yzhou/dbpedia/integratedOntology.owl " +
                "/media/RDFData/yzhou/dbpedia/data/dbpedia-processed.ttl " +
                "/home/yzhou/ontologies/dbpedia/data/dbpedia-minus-datatype-new.ttl " +
                "http://dbpedia.org/ontology/"
        ).split(" ");

        OWLOntology ontology = OWLHelper.loadOntology(args[0]);

        // IRIs of every data property and annotation property in the ontology's
        // signature; statements over these predicates are dropped by the handler.
        Set<String> properties2ignore = new HashSet<>();
        // boolean argument: presumably "include imports closure" -- TODO confirm
        // against the OWL API version this project is pinned to.
        for (OWLDataProperty prop : ontology.getDataPropertiesInSignature(true))
            properties2ignore.add(prop.toStringID());
        for (OWLAnnotationProperty prop : ontology.getAnnotationPropertiesInSignature())
            properties2ignore.add(prop.toStringID());

        // try-with-resources: the original leaked both streams when parse() threw.
        try (FileInputStream input = new FileInputStream(args[1]);
             FileOutputStream output = new FileOutputStream(args[2])) {
            TurtleParser parser = new TurtleParser();
            parser.setRDFHandler(new DataFilterRDFHandler(new TurtleWriter(output), properties2ignore));
            parser.parse(input, args[3]);
        } catch (RDFParseException | RDFHandlerException | IOException e) {
            e.printStackTrace();
        }
    }

}
package uk.ac.ox.cs.data.dbpedia;

import java.text.Normalizer;
import java.util.Set;

import org.apache.jena.iri.IRI;
import org.apache.jena.iri.IRIException;
import org.apache.jena.iri.IRIFactory;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.StatementImpl;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFWriter;

/**
 * RDF handler that forwards statements to a wrapped writer, dropping
 * <ul>
 *   <li>statements whose object is a literal (data/annotation assertions),</li>
 *   <li>statements whose predicate is in the supplied ignore set,</li>
 *   <li>statements whose subject or object IRI has violations or cannot be
 *       constructed at all,</li>
 * </ul>
 * and NFKC-normalising every subject/object URI that passes through.
 * Dropped statements with unexpected value types are logged to stdout.
 */
public class DataFilterRDFHandler implements RDFHandler {

    public static IRIFactory iriFactory = IRIFactory.semanticWebImplementation();

    /** Destination for every statement that survives filtering. */
    RDFWriter m_writer;
    /** Predicate IRIs whose statements are discarded. */
    Set<String> m_properties;

    public DataFilterRDFHandler(RDFWriter writer, Set<String> properties2ignore) {
        m_writer = writer;
        m_properties = properties2ignore;
    }

    /** Returns an NFKC-normalised copy of the given URI value. */
    private static URIImpl normalised(Value value) {
        return new URIImpl(Normalizer.normalize(value.toString(), Normalizer.Form.NFKC));
    }

    /**
     * Returns true iff the value's IRI parses without violations.
     * May throw {@link IRIException} when the IRI cannot be constructed.
     */
    private static boolean wellFormed(Value value) {
        IRI iri = iriFactory.construct(value.toString());
        // true: include warnings when checking for violations -- TODO confirm
        // this matches the strictness intended for the DBPedia dump.
        return !iri.hasViolation(true);
    }

    @Override
    public void endRDF() throws RDFHandlerException {
        m_writer.endRDF();
    }

    @Override
    public void handleComment(String comment) throws RDFHandlerException {
        m_writer.handleComment(comment);
    }

    @Override
    public void handleNamespace(String prefix, String uri) throws RDFHandlerException {
        m_writer.handleNamespace(prefix, uri);
    }

    @Override
    public void handleStatement(Statement statement) throws RDFHandlerException {
        Value oldObject = statement.getObject();
        Value newObject = null;

        if (oldObject instanceof Literal)
            return; // drop every literal-valued assertion outright
        else if (oldObject instanceof BNode)
            newObject = oldObject;
        else if (oldObject instanceof URI)
            newObject = normalised(oldObject);
        else
            System.out.println("Object: " + oldObject.getClass());

        // Predicate check deliberately kept after the object inspection to
        // preserve the original diagnostic-print behaviour.
        if (m_properties.contains(statement.getPredicate().toString()))
            return;

        Resource oldSubject = statement.getSubject();
        Resource newSubject = null;

        if (oldSubject instanceof BNode)
            newSubject = oldSubject;
        else if (oldSubject instanceof URI)
            newSubject = normalised(oldSubject);
        else
            System.out.println("Subject: " + oldSubject.getClass());

        if (newSubject == null || newObject == null) {
            System.out.println(statement);
            return;
        }

        try {
            if (newSubject instanceof URI && !wellFormed(newSubject)) {
                System.out.println(statement);
                return;
            }
            if (newObject instanceof URI && !wellFormed(newObject)) {
                System.out.println(statement);
                return;
            }
        } catch (IRIException e) {
            return; // unconstructible IRI: silently drop, as before
        }

        m_writer.handleStatement(new StatementImpl(newSubject, statement.getPredicate(), newObject));
    }

    @Override
    public void startRDF() throws RDFHandlerException {
        m_writer.startRDF();
    }

}
// NOTE(review): declared in package uk.ac.ox.cs.data.dbpedia in the original tree.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Cleans up a Turtle data dump: samples every {@code size}-th statement line,
 * removes or replaces characters that are problematic in IRIs, and strips
 * combining diacritical marks.
 */
public class Normaliser {

    /** Diacritic-stripping pattern; compiled once (was recompiled per deAccent call). */
    private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /** Characters removed outright from every processed line. */
    static final String illegalSymbols = ",()'‘";

    /** {character, replacement-text} pairs applied during processing. */
    static final String[][] replacedSymbols = new String[][] {
        {"æ", "ae"}, {"ø", "o"}, {"ß", "t"}, {"Ł", "L"}, {"ı", "i"},
        {"ł", "l"}, {"–", "-"}, {"&", "and"}, {"ð", "o"}, {"ə", "e"},
        {"Đ", "D"}, {"ħ", "h"}, {"đ", "d"}, {"Þ", "P"}
    };

    static Set<Character> symbols2remove;
    static Map<Character, String> symbols2replace;

    static {
        symbols2remove = new HashSet<>();
        for (int i = 0; i < illegalSymbols.length(); ++i)
            symbols2remove.add(illegalSymbols.charAt(i));

        symbols2replace = new HashMap<>();
        for (String[] pair : replacedSymbols)
            symbols2replace.put(pair[0].charAt(0), pair[1]);
    }

    static final String urlSymbols = "http://";
    static final int urlSymbolLength = urlSymbols.length();

    /**
     * args[0]: input file; args[1]: sampling size n (keep every n-th line).
     * Output goes to a sibling file with a "_new_fragment&lt;n&gt;" suffix.
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            args = new String[] {
                "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl",
                "1"
            };
        }

        String fragment = args[0];
        int size = Integer.parseInt(args[1]);

        int index = fragment.lastIndexOf('.');
        if (index != -1)
            fragment = fragment.substring(0, index) + "_new_fragment" + args[1] + fragment.substring(index);
        else
            fragment += "_fragment" + args[1];

        // NOTE(review): uses the platform default charset, as the original did;
        // given the accented input, UTF-8 was probably intended -- confirm.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment)))) {
            process(reader, writer, size);
        }
    }

    /** Copies every {@code size}-th line from reader to writer, unmodified. */
    public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        String line;
        int index = 0;
        while ((line = reader.readLine()) != null) {
            if (++index == size) {
                index = 0;
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /**
     * Normalises every {@code size}-th line (skipping lines containing '@' --
     * presumably Turtle directives or language-tagged literals; verify against
     * the dump format) and re-appends the terminating '.'.
     * No longer closes the streams: the caller owns them.
     */
    public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        int index = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.contains("@"))
                continue;

            if (++index == size) {
                writer.write(deAccent(process(line)));
                writer.write('.');
                writer.newLine();
                index = 0;
            }
        }
    }

    /**
     * Character-level clean-up of one line: drops '.' except inside http:// URLs,
     * applies the replacement table, and removes the illegal symbols.
     */
    private static String process(String line) {
        line = line.replace("%60", "_");

        // Number of leading chars of "http://" matched so far; == urlSymbolLength
        // means we are currently inside a URL.
        int inURL = 0;
        StringBuilder out = new StringBuilder();

        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i);

            // '.' outside a URL would terminate a Turtle statement: drop it.
            if (ch == '.') {
                if (inURL == urlSymbolLength)
                    out.append('.');
                continue;
            }

            if (inURL == urlSymbolLength) {
                // Delimiters that end the host/path portion of a URL.
                if (ch == '/' || ch == '#' || ch == ')' || ch == '>')
                    inURL = 0;
            }
            else if (ch == urlSymbols.charAt(inURL))
                ++inURL;
            else
                inURL = 0;

            String replacement = symbols2replace.get(ch);
            if (replacement != null)
                out.append(replacement);
            else if (!symbols2remove.contains(ch))
                out.append(ch);
        }

        return out.toString();
    }

    /** Removes accents by NFD-decomposing and deleting combining diacritical marks. */
    public static String deAccent(String str) {
        String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
        return DIACRITICS.matcher(decomposed).replaceAll("");
    }

}