From 0d8f240c9c0a64f2285324e5a517161e45c698fc Mon Sep 17 00:00:00 2001 From: yzhou Date: Thu, 30 Apr 2015 17:36:35 +0100 Subject: downgrade owl api and reorganised src files --- test/uk/ac/ox/cs/data/dbpedia/DataFilter.java | 68 --------- .../ox/cs/data/dbpedia/DataFilterRDFHandler.java | 116 --------------- test/uk/ac/ox/cs/data/dbpedia/Normaliser.java | 155 --------------------- 3 files changed, 339 deletions(-) delete mode 100644 test/uk/ac/ox/cs/data/dbpedia/DataFilter.java delete mode 100644 test/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java delete mode 100644 test/uk/ac/ox/cs/data/dbpedia/Normaliser.java (limited to 'test/uk/ac/ox/cs/data/dbpedia') diff --git a/test/uk/ac/ox/cs/data/dbpedia/DataFilter.java b/test/uk/ac/ox/cs/data/dbpedia/DataFilter.java deleted file mode 100644 index dc2f3e0..0000000 --- a/test/uk/ac/ox/cs/data/dbpedia/DataFilter.java +++ /dev/null @@ -1,68 +0,0 @@ -package uk.ac.ox.cs.data.dbpedia; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import org.openrdf.rio.RDFHandlerException; -import org.openrdf.rio.RDFParseException; -import org.openrdf.rio.turtle.TurtleParser; -import org.openrdf.rio.turtle.TurtleWriter; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLDataProperty; -import org.semanticweb.owlapi.model.OWLOntology; -import uk.ac.ox.cs.pagoda.owl.OWLHelper; - -public class DataFilter { - - public static void main(String[] args) throws FileNotFoundException { - filteringDBPedia(); - } - - /** - * Filter out data property assertions and annotation property assertions in the data set. - * - * @throws FileNotFoundException - */ - private static void filteringDBPedia() throws FileNotFoundException { - String[] args = ( -// "/home/yzhou/ontologies/npd/npd-all.owl " + -// "/home/yzhou/ontologies/npd/data/npd-data-dump-processed.ttl " + -// "/home/yzhou/ontologies/npd/data/npd-data-dump-minus-datatype-new.ttl " + -// "http://sws.ifi.uio.no/vocab/npd-all.owl#" - - "/media/RDFData/yzhou/dbpedia/integratedOntology.owl " + - "/media/RDFData/yzhou/dbpedia/data/dbpedia-processed.ttl " + - "/home/yzhou/ontologies/dbpedia/data/dbpedia-minus-datatype-new.ttl " + - "http://dbpedia.org/ontology/" - ).split("\\ "); - - - OWLOntology ontology = OWLHelper.loadOntology(args[0]); - - Set properties2ignore = new HashSet(); - for (OWLDataProperty prop: ontology.getDataPropertiesInSignature(true)) - properties2ignore.add(prop.toStringID()); - for (OWLAnnotationProperty prop: ontology.getAnnotationPropertiesInSignature()) - properties2ignore.add(prop.toStringID()); - - TurtleParser parser = new TurtleParser(); - TurtleWriter writer = new TurtleWriter(new FileOutputStream(args[2])); - - parser.setRDFHandler(new DataFilterRDFHandler(writer, properties2ignore)); - try { - parser.parse(new FileInputStream(args[1]), args[3]); - } catch (RDFParseException e) { - e.printStackTrace(); - } catch (RDFHandlerException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - } - -} diff --git a/test/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java b/test/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java deleted file mode 100644 index 6dbac91..0000000 --- a/test/uk/ac/ox/cs/data/dbpedia/DataFilterRDFHandler.java +++ /dev/null @@ -1,116 +0,0 @@ -package uk.ac.ox.cs.data.dbpedia; - -import java.text.Normalizer; -import java.util.Set; - -import org.apache.jena.iri.IRI; -import org.apache.jena.iri.IRIException; -import org.apache.jena.iri.IRIFactory; - -import org.openrdf.model.BNode; -import org.openrdf.model.Resource; -import org.openrdf.model.URI; -import org.openrdf.model.Value; -import org.openrdf.model.Literal; -import org.openrdf.model.Statement; -import org.openrdf.model.impl.StatementImpl; -import org.openrdf.model.impl.URIImpl; -import org.openrdf.rio.RDFHandler; -import org.openrdf.rio.RDFHandlerException; -import org.openrdf.rio.RDFWriter; - -public class DataFilterRDFHandler implements RDFHandler { - - public static IRIFactory iriFactory = IRIFactory.semanticWebImplementation(); - - RDFWriter m_writer; - Set m_properties; - - public DataFilterRDFHandler(RDFWriter writer, Set properties2ignore) { - m_writer = writer; - m_properties = properties2ignore; - } - - @Override - public void endRDF() throws RDFHandlerException { - m_writer.endRDF(); - } - - @Override - public void handleComment(String arg0) throws RDFHandlerException { - m_writer.handleComment(arg0); - } - - @Override - public void handleNamespace(String arg0, String arg1) throws RDFHandlerException { - m_writer.handleNamespace(arg0, arg1); - } - - @Override - public void handleStatement(Statement arg0) throws RDFHandlerException { - Value newObject = null, oldObject = arg0.getObject(); - - if (oldObject instanceof Literal) - return ; - else if (oldObject instanceof BNode) { - newObject = oldObject; - } - else if (oldObject instanceof URI) - newObject = new URIImpl(Normalizer.normalize(oldObject.toString(), Normalizer.Form.NFKC)); - else { - System.out.println("Object: " + oldObject.getClass()); - } - - String predicate = arg0.getPredicate().toString(); - if (m_properties.contains(predicate)) return ; - - Resource newSubject = null, oldSubject = arg0.getSubject(); - - if (oldSubject instanceof BNode) { - newSubject = oldSubject; - } - else if (oldSubject instanceof URI) { - newSubject = new URIImpl(Normalizer.normalize(oldSubject.toString(), Normalizer.Form.NFKC)); - } - else { - System.out.println("Subject: " + oldSubject.getClass()); - } - -// if (newObject.toString().contains("ns#type")) -// System.out.println(arg0); - - if (newSubject == null || newObject == null) { - System.out.println(arg0); - return ; - } - - IRI subjectIRI, objectIRI; - try { - if (newSubject instanceof URI){ - subjectIRI = iriFactory.construct(newSubject.toString()); - if (subjectIRI.hasViolation(true)) { - System.out.println(arg0); - return ; - } - } - if (newObject instanceof URI) { - objectIRI = iriFactory.construct(newObject.toString()); - if (objectIRI.hasViolation(true)) { - System.out.println(arg0); - return ; - } - } - - } catch (IRIException e) { - return ; - } - - m_writer.handleStatement(new StatementImpl(newSubject, arg0.getPredicate(), newObject)); - } - - @Override - public void startRDF() throws RDFHandlerException { - m_writer.startRDF(); - } - -} diff --git a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java deleted file mode 100644 index e025604..0000000 --- a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java +++ /dev/null @@ -1,155 +0,0 @@ -package uk.ac.ox.cs.data.dbpedia; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.text.Normalizer; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -public class Normaliser { - - public static void main(String[] args) throws IOException { - if (args.length == 0) { - args = new String[] { - "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl", - "1" - }; - } - - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]))); - String fragment = args[0]; - int size = Integer.valueOf(args[1]), index; - - if ((index = fragment.lastIndexOf(".")) != -1) { - fragment = fragment.substring(0, index) + "_new_fragment" + args[1] + fragment.substring(index); - } - else fragment += "_fragment" + args[1]; - - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment))); - -// simpleProcess(reader, writer, size); - process(reader, writer, size); - - writer.close(); - reader.close(); - } - - public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException { - String line; - int index = 0; - while ((line = reader.readLine()) != null) { - if (++index == size) { - index = 0; - writer.write(line); - writer.newLine(); - } - } - } - - static final String illegalSymbols = ",()'‘"; - static final String[][] replacedSymbols = new String[][] { - {"æ", "ae"}, - {"ø", "o"}, - {"ß", "t"}, - {"Ł", "L"}, - {"ı", "i"}, - {"ł", "l"}, - {"–", "-"}, - {"&", "and"}, - {"ð", "o"}, - {"ə", "e"}, - {"Đ", "D"}, - {"ħ", "h"}, -// {"%60", "_"}, - {"đ", "d"}, - {"Þ", "P"} - }; - - static Set symbols2remove; - static Map symbols2replace; - - static { - symbols2remove = new HashSet(); - for (int i = 0; i < illegalSymbols.length(); ++i) - symbols2remove.add(illegalSymbols.charAt(i)); - - symbols2replace = new HashMap(); - for (int i = 0; i < replacedSymbols.length; ++i) - symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]); - } - - static final String urlSymbols = "http://"; - static final int urlSymbolLength = 7; - - public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException { - int index = 0; - String line; - - String newLine; - while ((line = reader.readLine()) != null) { - if (line.contains("@")) - continue; - - if (++index == size) { - newLine = process(line); - writer.write(deAccent(newLine.toString())); - writer.write('.'); - writer.newLine(); - index = 0; - } - } - - writer.close(); - reader.close(); - } - - private static String process(String line) { - line = line.replace("%60", "_");//.replace("__", "_"); - - int inURL = 0; - char ch; - String str; - StringBuilder newLine = new StringBuilder(); - for (int i = 0; i < line.length(); ++i) { - ch = line.charAt(i); - - if (ch == '.') { - if (inURL == urlSymbolLength) - newLine.append('.'); - continue; - } - - if (inURL == urlSymbolLength) { - if (ch == '/' || ch == '#' || ch == ')' || ch == '>') inURL = 0; - } - else if (ch == urlSymbols.charAt(inURL)) { - ++inURL; - } - else inURL = 0; - - if ((str = symbols2replace.get(ch)) != null) - newLine.append(str); - else if (!symbols2remove.contains(ch)) - newLine.append(ch); - } - - return newLine.toString(); - } - - public static String deAccent(String str) { - String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD); - Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); - String t = pattern.matcher(nfdNormalizedString).replaceAll(""); - return t; - } - - -} -- cgit v1.2.3