From 0d8f240c9c0a64f2285324e5a517161e45c698fc Mon Sep 17 00:00:00 2001
From: yzhou
Date: Thu, 30 Apr 2015 17:36:35 +0100
Subject: downgrade owl api and reorganised src files

---
 external/uk/ac/ox/cs/data/sample/DataSampling.java | 320 +++++++++++++++++++++
 external/uk/ac/ox/cs/data/sample/RandomWalk.java   |  88 ++++++
 .../uk/ac/ox/cs/data/sample/RandomWalkMulti.java   | 112 ++++++++
 external/uk/ac/ox/cs/data/sample/Sampler.java      |  23 ++
 4 files changed, 543 insertions(+)
 create mode 100644 external/uk/ac/ox/cs/data/sample/DataSampling.java
 create mode 100644 external/uk/ac/ox/cs/data/sample/RandomWalk.java
 create mode 100644 external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java
 create mode 100644 external/uk/ac/ox/cs/data/sample/Sampler.java

diff --git a/external/uk/ac/ox/cs/data/sample/DataSampling.java b/external/uk/ac/ox/cs/data/sample/DataSampling.java
new file mode 100644
index 0000000..1a788e3
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/DataSampling.java
@@ -0,0 +1,320 @@
+package uk.ac.ox.cs.data.sample;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.*;
+import java.util.Map.Entry;
+
+import org.openrdf.model.Resource;
+import org.openrdf.model.Statement;
+import org.openrdf.model.URI;
+import org.openrdf.model.Value;
+import org.openrdf.model.impl.StatementImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.rio.RDFHandler;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.RDFParseException;
+import org.openrdf.rio.RDFParser;
+import org.openrdf.rio.ntriples.NTriplesParser;
+import org.openrdf.rio.turtle.*;
+
+import uk.ac.ox.cs.pagoda.owl.OWLHelper;
+import uk.ac.ox.cs.pagoda.util.Namespace;
+import uk.ac.ox.cs.pagoda.util.Utility;
+
+public class DataSampling {
+
+    File[] m_list;
+    RDFGraph m_graph;
+    double m_percentage;
+    Set<String> excludeEntities = new HashSet<String>();
+
+    public DataSampling(String prefix, String fileName, String excludeFile, double percentage, boolean inTurtle) {
+        if (excludeFile != null) {
+            try {
+                Scanner scanner = new Scanner(new File(excludeFile));
+                while (scanner.hasNextLine())
+                    excludeEntities.add(OWLHelper.removeAngles(scanner.nextLine().trim()));
+                scanner.close();
+            } catch (FileNotFoundException e1) {
+                // TODO Auto-generated catch block
+                e1.printStackTrace();
+            }
+        }
+        excludeEntities.add("http://www.w3.org/2002/07/owl#imports");
+
+        File file = new File(fileName);
+        if (file.isDirectory()) m_list = file.listFiles();
+        else m_list = new File[] {file};
+        m_percentage = percentage;
+
+        RDFParser parser = inTurtle ? new TurtleParser() : new NTriplesParser();
+
+        GraphRDFHandler handler = new GraphRDFHandler(excludeEntities);
+        parser.setRDFHandler(handler);
+
+        FileInputStream istream;
+        try {
+            for (File tFile: m_list) {
+                parser.parse(istream = new FileInputStream(tFile), prefix);
+                istream.close();
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (RDFParseException e) {
+            e.printStackTrace();
+        } catch (RDFHandlerException e) {
+            e.printStackTrace();
+        }
+
+        m_graph = handler.getGraph();
+    }
+
+    public void sample(String outputFile, boolean multiStart) {
+        try {
+            FileOutputStream ostream = new FileOutputStream(outputFile);
+            TurtleWriter writer = new TurtleWriter(ostream);
+            writer.startRDF();
+
+            if (m_percentage < 100) {
+                Sampler sam = multiStart ?
+                        new RandomWalkMulti(m_graph, writer) :
+                        new RandomWalk(m_graph, writer);
+                sam.setLimit((int) (m_graph.numberOfStatement / 100 * m_percentage));
+                System.out.println("Statement limit: " + (m_graph.numberOfStatement / 100 * m_percentage));
+                sam.sample();
+                sam.dispose();
+            }
+            else {
+                m_graph.visit(writer);
+            }
+            writer.endRDF();
+            ostream.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (RDFHandlerException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }
+    }
+
+    public static void main(String[] args) {
+        sampleReactome();
+//        sampleChEMBL();
+    }
+
+    static void sampleReactome() {
+//        double[] ps = {40, 70, 100};
+        double[] ps = {0.25, 0.5, 0.75};
+        for (double per: ps) {
+            DataSampling sampling = new DataSampling(
+                    "http://www.biopax.org/release/biopax-level3.owl#",
+//                    "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/data.ttl",
+                    "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/simplified.nt",
+//                    "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/reactome_sample_40.ttl",
+                    "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/exclude",
+                    per,
+                    true);
+            sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/sample_test_" + per + ".ttl", true);
+//            sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/simplifed_sample_test_" + per + ".ttl", true);
+//            sampling.sample("output/sample_reactome_multi.ttl", true);
+        }
+    }
+
+    static void sampleChEMBL() {
+        DataSampling sampling = new DataSampling(
+                "http://rdf.ebi.ac.uk/terms/chembl",
+                "/home/yzhou/RDFdata/ChEMBL/facts/chembl_kbfile.nt",
+                null,
+                100,
+                false);
+
+        sampling.sample("output/sample_chembl_multi.ttl", true);
+        sampling.sample("output/sample_chembl.ttl", false);
+    }
+
+}
+
+class RDFGraph {
+
+    Map<Value, Integer> index = new HashMap<Value, Integer>();
+    Map<Integer, Value> inverseIndex = new HashMap<Integer, Value>();
+    MapToList<Integer> labels = new MapToList<Integer>();
+
+    MapToList<RDFEdge> edges = new MapToList<RDFEdge>();
+    Set<String> excludeEntities;
+
+    int numberOfIndividuals = 0, numberOfProperties = 0;
+
+    public RDFGraph(Set<String> exclude) {
+        excludeEntities = exclude;
+        for (String str: excludeEntities)
+            System.out.println(str);
+        System.out.println("---------------");
+    }
+
+    public void visit(TurtleWriter writer) throws RDFHandlerException {
+        Integer key;
+        for (Entry<Integer, LinkedList<Integer>> entry: labels.entrySet()) {
+            key = entry.getKey();
+            for (Integer type: entry.getValue())
+                writer.handleStatement(getStatement(key, type));
+        }
+
+        for (Entry<Integer, LinkedList<RDFEdge>> entry: edges.entrySet()) {
+            key = entry.getKey();
+            if ((inverseIndex.get(key) instanceof URI) &&
+                    ((URI) inverseIndex.get(key)).toString().equals("http://www.reactome.org/biopax/46/879693#UnificationXref9"))
+                System.out.println("Here");
+
+            for (RDFEdge edge: entry.getValue())
+                writer.handleStatement(getStatement(key, edge.m_label, edge.m_dst));
+        }
+    }
+
+    private int getID(Value v, boolean isIndividual) {
+        if (v.toString().contains("imports"))
+            System.out.println(v.toString());
+        if (excludeEntities.contains(v.toString())) {
+            return 0;
+        }
+
+        Integer id = index.get(v);
+        if (id == null)
+            if (isIndividual) {
+                index.put(v, id = ++numberOfIndividuals);
+                inverseIndex.put(id, v);
+            }
+            else {
+                index.put(v, id = --numberOfProperties);
+                inverseIndex.put(id, v);
+            }
+        return id;
+    }
+
+    int numberOfStatement = 0;
+    int counter = 0;
+
+    public void addTriple(Resource s, URI p, Value o) {
+        ++numberOfStatement;
+        if (numberOfStatement % 1000000 == 0) {
+            Utility.logInfo("No.of statements: " + numberOfStatement, "\tNo.of individuals: " + numberOfIndividuals, "\tNo.of predicates: " + (-numberOfProperties));
statements: " + numberOfStatement, "\tNo.of individuals: " + numberOfIndividuals, "\tNo.of predicates: " + (-numberOfProperties)); + } + + if (p.equals(rdftype)) { + int type = getID(o, false), i = getID(s, true); + if (i == 0) { +// System.out.println("<" + s + "> <" + p + "> <" + o + ">"); + return ; + } + labels.add(i, type); + } + else { + int i = getID(s, true), j = getID(o, true), prop = getID(p, false) ; + if (i == 0 || j == 0 || prop == 0) { +// System.out.println("<" + s + "> <" + p + "> <" + o + ">"); + return ; + } + edges.add(i, new RDFEdge(prop, j)); + } + } + + URI rdftype = new URIImpl(Namespace.RDF_TYPE); + + public Statement getStatement(int... args) { + if (args.length == 2) + return new StatementImpl((Resource) inverseIndex.get(args[0]), rdftype, (Value) inverseIndex.get(args[1])); + else if (args.length == 3) + return new StatementImpl((Resource) inverseIndex.get(args[0]), (URI) inverseIndex.get(args[1]), (Value) inverseIndex.get(args[2])); + return null; + } + + public String getRawString(int id) { + return inverseIndex.get(id).toString(); + } + +} + +class MapToList { + + private Map> map = new HashMap>(); + + public void add(int key, T value) { + LinkedList list = map.get(key); + if (list == null) + map.put(key, list = new LinkedList()); + list.add(value); + } + + public Set>> entrySet() { + return map.entrySet(); + } + + public void shuffle() { + for (List list: map.values()) + Collections.shuffle(list); + } + + public LinkedList get(int key) { + return map.get(key); + } + +} + +class RDFEdge { + + int m_label, m_dst; + + public RDFEdge(int label, int dst) { + m_label = label; + m_dst = dst; + } + +} + +class GraphRDFHandler implements RDFHandler { + + RDFGraph m_graph; + Set m_exclude; + + public GraphRDFHandler(Set excludeEntities) { + m_exclude = excludeEntities; + } + + @Override + public void startRDF() throws RDFHandlerException { + m_graph = new RDFGraph(m_exclude); + } + + public RDFGraph getGraph() { + return m_graph; + } + + @Override + public void endRDF() throws RDFHandlerException { + // TODO Auto-generated method stub + + } + + @Override + public void handleNamespace(String prefix, String uri) + throws RDFHandlerException { + // TODO Auto-generated method stub + + } + + @Override + public void handleStatement(Statement st) throws RDFHandlerException { + m_graph.addTriple(st.getSubject(), st.getPredicate(), st.getObject()); + } + + @Override + public void handleComment(String comment) throws RDFHandlerException { + // TODO Auto-generated method stub + + } + +} \ No newline at end of file diff --git a/external/uk/ac/ox/cs/data/sample/RandomWalk.java b/external/uk/ac/ox/cs/data/sample/RandomWalk.java new file mode 100644 index 0000000..d9f5107 --- /dev/null +++ b/external/uk/ac/ox/cs/data/sample/RandomWalk.java @@ -0,0 +1,88 @@ +package uk.ac.ox.cs.data.sample; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.Stack; + +import org.openrdf.rio.RDFHandlerException; +import org.openrdf.rio.turtle.TurtleWriter; + +import uk.ac.ox.cs.pagoda.util.Utility; + +public class RandomWalk extends Sampler { + + public RandomWalk(RDFGraph graph, TurtleWriter writer) { + super(graph, writer); + } + + protected Random rand = new Random(); + + protected int noOfStatements = 0, statementLimit = 0; + protected Set visited = new HashSet(); + + @Override + public void setLimit(int limit) { + statementLimit = limit; + } + + @Override + public void sample() throws 
+        int u, v, pick, index;
+        RDFEdge edge;
+        List<RDFEdge> edges;
+        Stack<Integer> stack = new Stack<Integer>();
+        while (true) {
+            if (noOfStatements >= statementLimit) {
+                return;
+            }
+            if (stack.isEmpty()) {
+                stack.add(v = rand.nextInt(m_graph.numberOfIndividuals));
+                Utility.logInfo("A new start: " + m_graph.getRawString(v));
+                visit(v);
+            }
+            u = stack.peek();
+            if (rand.nextInt(100) < 15) {
+                stack.pop();
+                continue;
+            }
+            if ((edges = m_graph.edges.get(u)) == null || edges.size() == 0) {
+                stack.clear();
+                continue;
+            }
+
+            index = 0;
+            pick = rand.nextInt(edges.size());
+            for (Iterator<RDFEdge> iter = edges.iterator(); iter.hasNext(); ++index) {
+                edge = iter.next();
+                if (index == pick) {
+                    stack.add(v = edge.m_dst);
+                    visit(v);
+                    m_writer.handleStatement(m_graph.getStatement(u, edge.m_label, edge.m_dst));
+                    ++noOfStatements;
+                    iter.remove();
+                }
+
+            }
+        }
+    }
+
+    protected void visit(int node) throws RDFHandlerException {
+        if (visited.contains(node)) return;
+        visited.add(node);
+        List<Integer> list = m_graph.labels.get(node);
+        if (list == null) return;
+        for (Iterator<Integer> iter = list.iterator(); iter.hasNext(); )
+            m_writer.handleStatement(m_graph.getStatement(node, iter.next()));
+        noOfStatements += list.size();
+    }
+
+    @Override
+    public void dispose() {
+        visited.clear();
+    }
+
+
+}
diff --git a/external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java b/external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java
new file mode 100644
index 0000000..592f249
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java
@@ -0,0 +1,112 @@
+package uk.ac.ox.cs.data.sample;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.Stack;
+
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.turtle.TurtleWriter;
+
+import uk.ac.ox.cs.pagoda.util.Utility;
+
+
+public class RandomWalkMulti extends RandomWalk {
+
+    public RandomWalkMulti(RDFGraph graph, TurtleWriter writer) {
+        super(graph, writer);
+    }
+
+    Queue<Integer> queue = new LinkedList<Integer>();
+
+    @Override
+    public void sample() throws RDFHandlerException {
+        getStartNodes();
+
+        Utility.logInfo(queue.size());
+
+        int u, v, pick, index;
+        int individualLimit = statementLimit / queue.size(), currentLimit = 0;
+        RDFEdge edge;
+        List<RDFEdge> edges;
+        Stack<Integer> stack = new Stack<Integer>();
+        while (true) {
+            if (noOfStatements >= statementLimit) {
+                System.out.println("The number of statements in the sampling: " + noOfStatements);
+                return;
+            }
+            if (noOfStatements >= currentLimit) {
+                stack.clear();
+            }
+
+            if (stack.isEmpty()) {
+                if (queue.isEmpty())
+                    v = rand.nextInt(m_graph.numberOfIndividuals);
+                else {
+                    v = queue.poll();
+                    currentLimit += individualLimit;
+                }
+                stack.add(v);
+//                Utility.logInfo(noOfStart + " new start: " + m_graph.getRawString(v));
+                visit(v);
+            }
+            u = stack.peek();
+            if (rand.nextInt(100) < 15) {
+                stack.pop();
+                continue;
+            }
+            if ((edges = m_graph.edges.get(u)) == null || edges.size() == 0) {
+                stack.clear();
+                continue;
+            }
+
+            index = 0;
+            pick = rand.nextInt(edges.size());
+            for (Iterator<RDFEdge> iter = edges.iterator(); iter.hasNext(); ++index) {
+                edge = iter.next();
+                if (index == pick) {
+                    stack.add(v = edge.m_dst);
+                    visit(v);
+                    m_writer.handleStatement(m_graph.getStatement(u, edge.m_label, edge.m_dst));
+                    ++noOfStatements;
+                    iter.remove();
+                }
+
+            }
+        }
+    }
+
+    private void getStartNodes() throws RDFHandlerException {
+        Set<Integer> coveredConcepts = new HashSet<Integer>();
+        Integer concept;
+
+        Iterator<Integer> iter;
+        for (Map.Entry<Integer, LinkedList<Integer>> entry: m_graph.labels.entrySet()) {
+            iter = entry.getValue().iterator();
+            concept = null;
+
+            while (iter.hasNext()) {
+                if (!(coveredConcepts.contains(concept = iter.next()))) {
+                    break;
+                }
+                else concept = null;
+
+            }
+
+            if (concept == null) continue;
+            else {
+                queue.add(entry.getKey());
+                coveredConcepts.add(concept);
+                while (iter.hasNext())
+                    coveredConcepts.add(iter.next());
+            }
+        }
+
+    }
+
+
+}
diff --git a/external/uk/ac/ox/cs/data/sample/Sampler.java b/external/uk/ac/ox/cs/data/sample/Sampler.java
new file mode 100644
index 0000000..205b29b
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/Sampler.java
@@ -0,0 +1,23 @@
+package uk.ac.ox.cs.data.sample;
+
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.turtle.TurtleWriter;
+
+public abstract class Sampler {
+
+    protected RDFGraph m_graph;
+    protected TurtleWriter m_writer;
+
+    public Sampler(RDFGraph graph, TurtleWriter writer) {
+        m_graph = graph;
+        m_writer = writer;
+    }
+
+    public abstract void setLimit(int limit);
+
+    public abstract void sample() throws RDFHandlerException;
+
+    public abstract void dispose();
+
+
+}
-- 
cgit v1.2.3