From 0d8f240c9c0a64f2285324e5a517161e45c698fc Mon Sep 17 00:00:00 2001 From: yzhou Date: Thu, 30 Apr 2015 17:36:35 +0100 Subject: downgrade owl api and reorganised src files --- test/uk/ac/ox/cs/data/sample/DataSampling.java | 320 ---------------------- test/uk/ac/ox/cs/data/sample/RandomWalk.java | 88 ------ test/uk/ac/ox/cs/data/sample/RandomWalkMulti.java | 112 -------- test/uk/ac/ox/cs/data/sample/Sampler.java | 23 -- 4 files changed, 543 deletions(-) delete mode 100644 test/uk/ac/ox/cs/data/sample/DataSampling.java delete mode 100644 test/uk/ac/ox/cs/data/sample/RandomWalk.java delete mode 100644 test/uk/ac/ox/cs/data/sample/RandomWalkMulti.java delete mode 100644 test/uk/ac/ox/cs/data/sample/Sampler.java (limited to 'test/uk/ac/ox/cs/data/sample') diff --git a/test/uk/ac/ox/cs/data/sample/DataSampling.java b/test/uk/ac/ox/cs/data/sample/DataSampling.java deleted file mode 100644 index 1a788e3..0000000 --- a/test/uk/ac/ox/cs/data/sample/DataSampling.java +++ /dev/null @@ -1,320 +0,0 @@ -package uk.ac.ox.cs.data.sample; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.*; -import java.util.Map.Entry; - -import org.openrdf.model.Resource; -import org.openrdf.model.Statement; -import org.openrdf.model.URI; -import org.openrdf.model.Value; -import org.openrdf.model.impl.StatementImpl; -import org.openrdf.model.impl.URIImpl; -import org.openrdf.rio.RDFHandler; -import org.openrdf.rio.RDFHandlerException; -import org.openrdf.rio.RDFParseException; -import org.openrdf.rio.RDFParser; -import org.openrdf.rio.ntriples.NTriplesParser; -import org.openrdf.rio.turtle.*; - -import uk.ac.ox.cs.pagoda.owl.OWLHelper; -import uk.ac.ox.cs.pagoda.util.Namespace; -import uk.ac.ox.cs.pagoda.util.Utility; - -public class DataSampling { - - File[] m_list; - RDFGraph m_graph; - double m_percentage; - Set excludeEntities = new HashSet(); - - public DataSampling(String prefix, String fileName, String excludeFile, double percentage, boolean inTurtle) { - if (excludeFile != null) { - try { - Scanner scanner = new Scanner(new File(excludeFile)); - while (scanner.hasNextLine()) - excludeEntities.add(OWLHelper.removeAngles(scanner.nextLine().trim())); - scanner.close(); - } catch (FileNotFoundException e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); - } - } - excludeEntities.add("http://www.w3.org/2002/07/owl#imports"); - - File file = new File(fileName); - if (file.isDirectory()) m_list = file.listFiles(); - else m_list = new File[] {file}; - m_percentage = percentage; - - RDFParser parser = inTurtle ? new TurtleParser() : new NTriplesParser(); - - GraphRDFHandler handler = new GraphRDFHandler(excludeEntities); - parser.setRDFHandler(handler); - - FileInputStream istream; - try { - for (File tFile: m_list) { - parser.parse(istream = new FileInputStream(tFile), prefix); - istream.close(); - } - } catch (IOException e) { - e.printStackTrace(); - } catch (RDFParseException e) { - e.printStackTrace(); - } catch (RDFHandlerException e) { - e.printStackTrace(); - } - - m_graph = handler.getGraph(); - } - - public void sample(String outputFile, boolean multiStart) { - try { - FileOutputStream ostream = new FileOutputStream(outputFile); - TurtleWriter writer = new TurtleWriter(ostream); - writer.startRDF(); - - if (m_percentage < 100) { - Sampler sam = multiStart ? - new RandomWalkMulti(m_graph, writer) : - new RandomWalk(m_graph, writer); - sam.setLimit((int) (m_graph.numberOfStatement / 100 * m_percentage)); - System.out.println("Statement limit: " + (m_graph.numberOfStatement / 100 * m_percentage)); - sam.sample(); - sam.dispose(); - } - else { - m_graph.visit(writer); - } - writer.endRDF(); - ostream.close(); - } catch (IOException e) { - e.printStackTrace(); - } catch (RDFHandlerException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - - public static void main(String[] args) { - sampleReactome(); -// sampleChEMBL(); - } - - static void sampleReactome() { -// double[] ps = {40, 70, 100}; - double[] ps = {0.25, 0.5, 0.75}; - for (double per: ps) { - DataSampling sampling = new DataSampling( - "http://www.biopax.org/release/biopax-level3.owl#", -// "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/data.ttl", - "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/simplified.nt", -// "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/reactome_sample_40.ttl", - "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/exclude", - per, - true); - sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/sample_test_" + per + ".ttl", true); -// sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/simplifed_sample_test_" + per + ".ttl", true); -// sampling.sample("output/sample_reactome_multi.ttl", true); - } - } - - static void sampleChEMBL() { - DataSampling sampling = new DataSampling( - "http://rdf.ebi.ac.uk/terms/chembl", - "/home/yzhou/RDFdata/ChEMBL/facts/chembl_kbfile.nt", - null, - 100, - false); - - sampling.sample("output/sample_chembl_multi.ttl", true); - sampling.sample("output/sample_chembl.ttl", false); - } - -} - -class RDFGraph { - - Map index = new HashMap(); - Map inverseIndex = new HashMap(); - MapToList labels = new MapToList(); - - MapToList edges = new MapToList(); - Set excludeEntities; - - int numberOfIndividuals = 0, numberOfProperties = 0; - - public RDFGraph(Set exclude) { - excludeEntities = exclude; - for (String str: excludeEntities) - System.out.println(str); - System.out.println("---------------"); - } - - public void visit(TurtleWriter writer) throws RDFHandlerException { - Integer key; - for (Entry> entry: labels.entrySet()) { - key = entry.getKey(); - for (Integer type: entry.getValue()) - writer.handleStatement(getStatement(key, type)); - } - - for (Entry> entry: edges.entrySet()) { - key = entry.getKey(); - if ((inverseIndex.get(key) instanceof URI) && - ((URI) inverseIndex.get(key)).toString().equals("http://www.reactome.org/biopax/46/879693#UnificationXref9")) - System.out.println("Here"); - - for (RDFEdge edge: entry.getValue()) - writer.handleStatement(getStatement(key, edge.m_label, edge.m_dst)); - } - } - - private int getID(Value v, boolean isIndividual) { - if (v.toString().contains("imports")) - System.out.println(v.toString()); - if (excludeEntities.contains(v.toString())) { - return 0; - } - - Integer id = index.get(v); - if (id == null) - if (isIndividual) { - index.put(v, id = ++numberOfIndividuals); - inverseIndex.put(id, v); - } - else { - index.put(v, id = --numberOfProperties); - inverseIndex.put(id, v); - } - return id; - } - - int numberOfStatement = 0; - int counter = 0; - - public void addTriple(Resource s, URI p, Value o) { - ++numberOfStatement; - if (numberOfStatement % 1000000 == 0) { - Utility.logInfo("No.of statements: " + numberOfStatement, "\tNo.of individuals: " + numberOfIndividuals, "\tNo.of predicates: " + (-numberOfProperties)); - } - - if (p.equals(rdftype)) { - int type = getID(o, false), i = getID(s, true); - if (i == 0) { -// System.out.println("<" + s + "> <" + p + "> <" + o + ">"); - return ; - } - labels.add(i, type); - } - else { - int i = getID(s, true), j = getID(o, true), prop = getID(p, false) ; - if (i == 0 || j == 0 || prop == 0) { -// System.out.println("<" + s + "> <" + p + "> <" + o + ">"); - return ; - } - edges.add(i, new RDFEdge(prop, j)); - } - } - - URI rdftype = new URIImpl(Namespace.RDF_TYPE); - - public Statement getStatement(int... args) { - if (args.length == 2) - return new StatementImpl((Resource) inverseIndex.get(args[0]), rdftype, (Value) inverseIndex.get(args[1])); - else if (args.length == 3) - return new StatementImpl((Resource) inverseIndex.get(args[0]), (URI) inverseIndex.get(args[1]), (Value) inverseIndex.get(args[2])); - return null; - } - - public String getRawString(int id) { - return inverseIndex.get(id).toString(); - } - -} - -class MapToList { - - private Map> map = new HashMap>(); - - public void add(int key, T value) { - LinkedList list = map.get(key); - if (list == null) - map.put(key, list = new LinkedList()); - list.add(value); - } - - public Set>> entrySet() { - return map.entrySet(); - } - - public void shuffle() { - for (List list: map.values()) - Collections.shuffle(list); - } - - public LinkedList get(int key) { - return map.get(key); - } - -} - -class RDFEdge { - - int m_label, m_dst; - - public RDFEdge(int label, int dst) { - m_label = label; - m_dst = dst; - } - -} - -class GraphRDFHandler implements RDFHandler { - - RDFGraph m_graph; - Set m_exclude; - - public GraphRDFHandler(Set excludeEntities) { - m_exclude = excludeEntities; - } - - @Override - public void startRDF() throws RDFHandlerException { - m_graph = new RDFGraph(m_exclude); - } - - public RDFGraph getGraph() { - return m_graph; - } - - @Override - public void endRDF() throws RDFHandlerException { - // TODO Auto-generated method stub - - } - - @Override - public void handleNamespace(String prefix, String uri) - throws RDFHandlerException { - // TODO Auto-generated method stub - - } - - @Override - public void handleStatement(Statement st) throws RDFHandlerException { - m_graph.addTriple(st.getSubject(), st.getPredicate(), st.getObject()); - } - - @Override - public void handleComment(String comment) throws RDFHandlerException { - // TODO Auto-generated method stub - - } - -} \ No newline at end of file diff --git a/test/uk/ac/ox/cs/data/sample/RandomWalk.java b/test/uk/ac/ox/cs/data/sample/RandomWalk.java deleted file mode 100644 index d9f5107..0000000 --- a/test/uk/ac/ox/cs/data/sample/RandomWalk.java +++ /dev/null @@ -1,88 +0,0 @@ -package uk.ac.ox.cs.data.sample; - -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Random; -import java.util.Set; -import java.util.Stack; - -import org.openrdf.rio.RDFHandlerException; -import org.openrdf.rio.turtle.TurtleWriter; - -import uk.ac.ox.cs.pagoda.util.Utility; - -public class RandomWalk extends Sampler { - - public RandomWalk(RDFGraph graph, TurtleWriter writer) { - super(graph, writer); - } - - protected Random rand = new Random(); - - protected int noOfStatements = 0, statementLimit = 0; - protected Set visited = new HashSet(); - - @Override - public void setLimit(int limit) { - statementLimit = limit; - } - - @Override - public void sample() throws RDFHandlerException { - int u, v, pick, index; - RDFEdge edge; - List edges; - Stack stack = new Stack(); - while (true) { - if (noOfStatements >= statementLimit) { - return ; - } - if (stack.isEmpty()) { - stack.add(v = rand.nextInt(m_graph.numberOfIndividuals)); - Utility.logInfo("A new start: " + m_graph.getRawString(v)); - visit(v); - } - u = stack.peek(); - if (rand.nextInt(100) < 15) { - stack.pop(); - continue; - } - if ((edges = m_graph.edges.get(u)) == null || edges.size() == 0) { - stack.clear(); - continue; - } - - index = 0; - pick = rand.nextInt(edges.size()); - for (Iterator iter = edges.iterator(); iter.hasNext(); ++index) { - edge = iter.next(); - if (index == pick) { - stack.add(v = edge.m_dst); - visit(v); - m_writer.handleStatement(m_graph.getStatement(u, edge.m_label, edge.m_dst)); - ++noOfStatements; - iter.remove(); - } - - } - } - } - - protected void visit(int node) throws RDFHandlerException { - if (visited.contains(node)) return ; - visited.add(node); - List list = m_graph.labels.get(node); - if (list == null) return ; - for (Iterator iter = list.iterator(); iter.hasNext(); ) - m_writer.handleStatement(m_graph.getStatement(node, iter.next())); - noOfStatements += list.size(); - } - - @Override - public void dispose() { - visited.clear(); - } - - -} diff --git a/test/uk/ac/ox/cs/data/sample/RandomWalkMulti.java b/test/uk/ac/ox/cs/data/sample/RandomWalkMulti.java deleted file mode 100644 index 592f249..0000000 --- a/test/uk/ac/ox/cs/data/sample/RandomWalkMulti.java +++ /dev/null @@ -1,112 +0,0 @@ -package uk.ac.ox.cs.data.sample; - -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Queue; -import java.util.Set; -import java.util.Stack; - -import org.openrdf.rio.RDFHandlerException; -import org.openrdf.rio.turtle.TurtleWriter; - -import uk.ac.ox.cs.pagoda.util.Utility; - - -public class RandomWalkMulti extends RandomWalk { - - public RandomWalkMulti(RDFGraph graph, TurtleWriter writer) { - super(graph, writer); - } - - Queue queue = new LinkedList(); - - @Override - public void sample() throws RDFHandlerException { - getStartNodes(); - - Utility.logInfo(queue.size()); - - int u, v, pick, index; - int individualLimit = statementLimit / queue.size(), currentLimit = 0; - RDFEdge edge; - List edges; - Stack stack = new Stack(); - while (true) { - if (noOfStatements >= statementLimit) { - System.out.println("The number of statements in the sampling: " + noOfStatements); - return ; - } - if (noOfStatements >= currentLimit) { - stack.clear(); - } - - if (stack.isEmpty()) { - if (queue.isEmpty()) - v = rand.nextInt(m_graph.numberOfIndividuals); - else { - v = queue.poll(); - currentLimit += individualLimit; - } - stack.add(v); -// Utility.logInfo(noOfStart + " new start: " + m_graph.getRawString(v)); - visit(v); - } - u = stack.peek(); - if (rand.nextInt(100) < 15) { - stack.pop(); - continue; - } - if ((edges = m_graph.edges.get(u)) == null || edges.size() == 0) { - stack.clear(); - continue; - } - - index = 0; - pick = rand.nextInt(edges.size()); - for (Iterator iter = edges.iterator(); iter.hasNext(); ++index) { - edge = iter.next(); - if (index == pick) { - stack.add(v = edge.m_dst); - visit(v); - m_writer.handleStatement(m_graph.getStatement(u, edge.m_label, edge.m_dst)); - ++noOfStatements; - iter.remove(); - } - - } - } - } - - private void getStartNodes() throws RDFHandlerException { - Set coveredConcepts = new HashSet(); - Integer concept; - - Iterator iter; - for (Map.Entry> entry: m_graph.labels.entrySet()) { - iter = entry.getValue().iterator(); - concept = null; - - while (iter.hasNext()) { - if (!(coveredConcepts.contains(concept = iter.next()))) { - break; - } - else concept = null; - - } - - if (concept == null) continue; - else { - queue.add(entry.getKey()); - coveredConcepts.add(concept); - while (iter.hasNext()) - coveredConcepts.add(iter.next()); - } - } - - } - - -} diff --git a/test/uk/ac/ox/cs/data/sample/Sampler.java b/test/uk/ac/ox/cs/data/sample/Sampler.java deleted file mode 100644 index 205b29b..0000000 --- a/test/uk/ac/ox/cs/data/sample/Sampler.java +++ /dev/null @@ -1,23 +0,0 @@ -package uk.ac.ox.cs.data.sample; - -import org.openrdf.rio.RDFHandlerException; -import org.openrdf.rio.turtle.TurtleWriter; - -public abstract class Sampler { - - protected RDFGraph m_graph; - protected TurtleWriter m_writer; - - public Sampler(RDFGraph graph, TurtleWriter writer) { - m_graph = graph; - m_writer = writer; - } - - public abstract void setLimit(int limit); - - public abstract void sample() throws RDFHandlerException; - - public abstract void dispose(); - - -} -- cgit v1.2.3