aboutsummaryrefslogtreecommitdiff
path: root/external/uk/ac/ox/cs/data/sample
diff options
context:
space:
mode:
authoryzhou <yzhou@krr-linux.cs.ox.ac.uk>2015-04-30 17:36:35 +0100
committeryzhou <yzhou@krr-linux.cs.ox.ac.uk>2015-04-30 17:36:35 +0100
commit0d8f240c9c0a64f2285324e5a517161e45c698fc (patch)
treef4b4f7078e3be02011b9812cd8791c657a135993 /external/uk/ac/ox/cs/data/sample
parent68ae342b2a4923bc7b3f378c6a489f2355d85279 (diff)
downloadACQuA-0d8f240c9c0a64f2285324e5a517161e45c698fc.tar.gz
ACQuA-0d8f240c9c0a64f2285324e5a517161e45c698fc.zip
downgrade owl api and reorganised src files
Diffstat (limited to 'external/uk/ac/ox/cs/data/sample')
-rw-r--r--external/uk/ac/ox/cs/data/sample/DataSampling.java320
-rw-r--r--external/uk/ac/ox/cs/data/sample/RandomWalk.java88
-rw-r--r--external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java112
-rw-r--r--external/uk/ac/ox/cs/data/sample/Sampler.java23
4 files changed, 543 insertions, 0 deletions
diff --git a/external/uk/ac/ox/cs/data/sample/DataSampling.java b/external/uk/ac/ox/cs/data/sample/DataSampling.java
new file mode 100644
index 0000000..1a788e3
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/DataSampling.java
@@ -0,0 +1,320 @@
1package uk.ac.ox.cs.data.sample;
2
3import java.io.File;
4import java.io.FileInputStream;
5import java.io.FileNotFoundException;
6import java.io.FileOutputStream;
7import java.io.IOException;
8import java.util.*;
9import java.util.Map.Entry;
10
11import org.openrdf.model.Resource;
12import org.openrdf.model.Statement;
13import org.openrdf.model.URI;
14import org.openrdf.model.Value;
15import org.openrdf.model.impl.StatementImpl;
16import org.openrdf.model.impl.URIImpl;
17import org.openrdf.rio.RDFHandler;
18import org.openrdf.rio.RDFHandlerException;
19import org.openrdf.rio.RDFParseException;
20import org.openrdf.rio.RDFParser;
21import org.openrdf.rio.ntriples.NTriplesParser;
22import org.openrdf.rio.turtle.*;
23
24import uk.ac.ox.cs.pagoda.owl.OWLHelper;
25import uk.ac.ox.cs.pagoda.util.Namespace;
26import uk.ac.ox.cs.pagoda.util.Utility;
27
28public class DataSampling {
29
30 File[] m_list;
31 RDFGraph m_graph;
32 double m_percentage;
33 Set<String> excludeEntities = new HashSet<String>();
34
35 public DataSampling(String prefix, String fileName, String excludeFile, double percentage, boolean inTurtle) {
36 if (excludeFile != null) {
37 try {
38 Scanner scanner = new Scanner(new File(excludeFile));
39 while (scanner.hasNextLine())
40 excludeEntities.add(OWLHelper.removeAngles(scanner.nextLine().trim()));
41 scanner.close();
42 } catch (FileNotFoundException e1) {
43 // TODO Auto-generated catch block
44 e1.printStackTrace();
45 }
46 }
47 excludeEntities.add("http://www.w3.org/2002/07/owl#imports");
48
49 File file = new File(fileName);
50 if (file.isDirectory()) m_list = file.listFiles();
51 else m_list = new File[] {file};
52 m_percentage = percentage;
53
54 RDFParser parser = inTurtle ? new TurtleParser() : new NTriplesParser();
55
56 GraphRDFHandler handler = new GraphRDFHandler(excludeEntities);
57 parser.setRDFHandler(handler);
58
59 FileInputStream istream;
60 try {
61 for (File tFile: m_list) {
62 parser.parse(istream = new FileInputStream(tFile), prefix);
63 istream.close();
64 }
65 } catch (IOException e) {
66 e.printStackTrace();
67 } catch (RDFParseException e) {
68 e.printStackTrace();
69 } catch (RDFHandlerException e) {
70 e.printStackTrace();
71 }
72
73 m_graph = handler.getGraph();
74 }
75
76 public void sample(String outputFile, boolean multiStart) {
77 try {
78 FileOutputStream ostream = new FileOutputStream(outputFile);
79 TurtleWriter writer = new TurtleWriter(ostream);
80 writer.startRDF();
81
82 if (m_percentage < 100) {
83 Sampler sam = multiStart ?
84 new RandomWalkMulti(m_graph, writer) :
85 new RandomWalk(m_graph, writer);
86 sam.setLimit((int) (m_graph.numberOfStatement / 100 * m_percentage));
87 System.out.println("Statement limit: " + (m_graph.numberOfStatement / 100 * m_percentage));
88 sam.sample();
89 sam.dispose();
90 }
91 else {
92 m_graph.visit(writer);
93 }
94 writer.endRDF();
95 ostream.close();
96 } catch (IOException e) {
97 e.printStackTrace();
98 } catch (RDFHandlerException e) {
99 // TODO Auto-generated catch block
100 e.printStackTrace();
101 }
102 }
103
104 public static void main(String[] args) {
105 sampleReactome();
106// sampleChEMBL();
107 }
108
109 static void sampleReactome() {
110// double[] ps = {40, 70, 100};
111 double[] ps = {0.25, 0.5, 0.75};
112 for (double per: ps) {
113 DataSampling sampling = new DataSampling(
114 "http://www.biopax.org/release/biopax-level3.owl#",
115// "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/data.ttl",
116 "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/simplified.nt",
117// "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/reactome_sample_40.ttl",
118 "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/exclude",
119 per,
120 true);
121 sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/sample_test_" + per + ".ttl", true);
122// sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/simplifed_sample_test_" + per + ".ttl", true);
123// sampling.sample("output/sample_reactome_multi.ttl", true);
124 }
125 }
126
127 static void sampleChEMBL() {
128 DataSampling sampling = new DataSampling(
129 "http://rdf.ebi.ac.uk/terms/chembl",
130 "/home/yzhou/RDFdata/ChEMBL/facts/chembl_kbfile.nt",
131 null,
132 100,
133 false);
134
135 sampling.sample("output/sample_chembl_multi.ttl", true);
136 sampling.sample("output/sample_chembl.ttl", false);
137 }
138
139}
140
141class RDFGraph {
142
143 Map<Value, Integer> index = new HashMap<Value, Integer>();
144 Map<Integer, Value> inverseIndex = new HashMap<Integer, Value>();
145 MapToList<Integer> labels = new MapToList<Integer>();
146
147 MapToList<RDFEdge> edges = new MapToList<RDFEdge>();
148 Set<String> excludeEntities;
149
150 int numberOfIndividuals = 0, numberOfProperties = 0;
151
152 public RDFGraph(Set<String> exclude) {
153 excludeEntities = exclude;
154 for (String str: excludeEntities)
155 System.out.println(str);
156 System.out.println("---------------");
157 }
158
159 public void visit(TurtleWriter writer) throws RDFHandlerException {
160 Integer key;
161 for (Entry<Integer, LinkedList<Integer>> entry: labels.entrySet()) {
162 key = entry.getKey();
163 for (Integer type: entry.getValue())
164 writer.handleStatement(getStatement(key, type));
165 }
166
167 for (Entry<Integer, LinkedList<RDFEdge>> entry: edges.entrySet()) {
168 key = entry.getKey();
169 if ((inverseIndex.get(key) instanceof URI) &&
170 ((URI) inverseIndex.get(key)).toString().equals("http://www.reactome.org/biopax/46/879693#UnificationXref9"))
171 System.out.println("Here");
172
173 for (RDFEdge edge: entry.getValue())
174 writer.handleStatement(getStatement(key, edge.m_label, edge.m_dst));
175 }
176 }
177
178 private int getID(Value v, boolean isIndividual) {
179 if (v.toString().contains("imports"))
180 System.out.println(v.toString());
181 if (excludeEntities.contains(v.toString())) {
182 return 0;
183 }
184
185 Integer id = index.get(v);
186 if (id == null)
187 if (isIndividual) {
188 index.put(v, id = ++numberOfIndividuals);
189 inverseIndex.put(id, v);
190 }
191 else {
192 index.put(v, id = --numberOfProperties);
193 inverseIndex.put(id, v);
194 }
195 return id;
196 }
197
198 int numberOfStatement = 0;
199 int counter = 0;
200
201 public void addTriple(Resource s, URI p, Value o) {
202 ++numberOfStatement;
203 if (numberOfStatement % 1000000 == 0) {
204 Utility.logInfo("No.of statements: " + numberOfStatement, "\tNo.of individuals: " + numberOfIndividuals, "\tNo.of predicates: " + (-numberOfProperties));
205 }
206
207 if (p.equals(rdftype)) {
208 int type = getID(o, false), i = getID(s, true);
209 if (i == 0) {
210// System.out.println("<" + s + "> <" + p + "> <" + o + ">");
211 return ;
212 }
213 labels.add(i, type);
214 }
215 else {
216 int i = getID(s, true), j = getID(o, true), prop = getID(p, false) ;
217 if (i == 0 || j == 0 || prop == 0) {
218// System.out.println("<" + s + "> <" + p + "> <" + o + ">");
219 return ;
220 }
221 edges.add(i, new RDFEdge(prop, j));
222 }
223 }
224
225 URI rdftype = new URIImpl(Namespace.RDF_TYPE);
226
227 public Statement getStatement(int... args) {
228 if (args.length == 2)
229 return new StatementImpl((Resource) inverseIndex.get(args[0]), rdftype, (Value) inverseIndex.get(args[1]));
230 else if (args.length == 3)
231 return new StatementImpl((Resource) inverseIndex.get(args[0]), (URI) inverseIndex.get(args[1]), (Value) inverseIndex.get(args[2]));
232 return null;
233 }
234
235 public String getRawString(int id) {
236 return inverseIndex.get(id).toString();
237 }
238
239}
240
/**
 * A multimap from int keys to linked lists of values.
 *
 * @param <T> element type of the lists
 */
class MapToList<T> {

    private Map<Integer, LinkedList<T>> map = new HashMap<Integer, LinkedList<T>>();

    /**
     * Appends {@code value} to the list stored under {@code key}, creating
     * the list when the key is seen for the first time.
     */
    public void add(int key, T value) {
        LinkedList<T> bucket = map.get(key);
        if (bucket != null) {
            bucket.add(value);
            return;
        }
        bucket = new LinkedList<T>();
        map.put(key, bucket);
        bucket.add(value);
    }

    /** Returns the entry set of the backing map (a live view, not a copy). */
    public Set<Map.Entry<Integer, LinkedList<T>>> entrySet() {
        return map.entrySet();
    }

    /** Shuffles every stored list in place. */
    public void shuffle() {
        Iterator<LinkedList<T>> buckets = map.values().iterator();
        while (buckets.hasNext()) {
            Collections.shuffle(buckets.next());
        }
    }

    /** Returns the list stored under {@code key}, or {@code null} if there is none. */
    public LinkedList<T> get(int key) {
        return map.get(key);
    }

}
266
/** An outgoing edge: the id of its label and the id of its destination. */
class RDFEdge {

    /** Id stored as the edge label. */
    int m_label;
    /** Id stored as the edge destination. */
    int m_dst;

    /**
     * @param label id to store in {@code m_label}
     * @param dst   id to store in {@code m_dst}
     */
    public RDFEdge(int label, int dst) {
        this.m_dst = dst;
        this.m_label = label;
    }

}
277
278class GraphRDFHandler implements RDFHandler {
279
280 RDFGraph m_graph;
281 Set<String> m_exclude;
282
283 public GraphRDFHandler(Set<String> excludeEntities) {
284 m_exclude = excludeEntities;
285 }
286
287 @Override
288 public void startRDF() throws RDFHandlerException {
289 m_graph = new RDFGraph(m_exclude);
290 }
291
292 public RDFGraph getGraph() {
293 return m_graph;
294 }
295
296 @Override
297 public void endRDF() throws RDFHandlerException {
298 // TODO Auto-generated method stub
299
300 }
301
302 @Override
303 public void handleNamespace(String prefix, String uri)
304 throws RDFHandlerException {
305 // TODO Auto-generated method stub
306
307 }
308
309 @Override
310 public void handleStatement(Statement st) throws RDFHandlerException {
311 m_graph.addTriple(st.getSubject(), st.getPredicate(), st.getObject());
312 }
313
314 @Override
315 public void handleComment(String comment) throws RDFHandlerException {
316 // TODO Auto-generated method stub
317
318 }
319
320} \ No newline at end of file
diff --git a/external/uk/ac/ox/cs/data/sample/RandomWalk.java b/external/uk/ac/ox/cs/data/sample/RandomWalk.java
new file mode 100644
index 0000000..d9f5107
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/RandomWalk.java
@@ -0,0 +1,88 @@
1package uk.ac.ox.cs.data.sample;
2
3import java.util.HashSet;
4import java.util.Iterator;
5import java.util.List;
6import java.util.Random;
7import java.util.Set;
8import java.util.Stack;
9
10import org.openrdf.rio.RDFHandlerException;
11import org.openrdf.rio.turtle.TurtleWriter;
12
13import uk.ac.ox.cs.pagoda.util.Utility;
14
15public class RandomWalk extends Sampler {
16
17 public RandomWalk(RDFGraph graph, TurtleWriter writer) {
18 super(graph, writer);
19 }
20
21 protected Random rand = new Random();
22
23 protected int noOfStatements = 0, statementLimit = 0;
24 protected Set<Integer> visited = new HashSet<Integer>();
25
26 @Override
27 public void setLimit(int limit) {
28 statementLimit = limit;
29 }
30
31 @Override
32 public void sample() throws RDFHandlerException {
33 int u, v, pick, index;
34 RDFEdge edge;
35 List<RDFEdge> edges;
36 Stack<Integer> stack = new Stack<Integer>();
37 while (true) {
38 if (noOfStatements >= statementLimit) {
39 return ;
40 }
41 if (stack.isEmpty()) {
42 stack.add(v = rand.nextInt(m_graph.numberOfIndividuals));
43 Utility.logInfo("A new start: " + m_graph.getRawString(v));
44 visit(v);
45 }
46 u = stack.peek();
47 if (rand.nextInt(100) < 15) {
48 stack.pop();
49 continue;
50 }
51 if ((edges = m_graph.edges.get(u)) == null || edges.size() == 0) {
52 stack.clear();
53 continue;
54 }
55
56 index = 0;
57 pick = rand.nextInt(edges.size());
58 for (Iterator<RDFEdge> iter = edges.iterator(); iter.hasNext(); ++index) {
59 edge = iter.next();
60 if (index == pick) {
61 stack.add(v = edge.m_dst);
62 visit(v);
63 m_writer.handleStatement(m_graph.getStatement(u, edge.m_label, edge.m_dst));
64 ++noOfStatements;
65 iter.remove();
66 }
67
68 }
69 }
70 }
71
72 protected void visit(int node) throws RDFHandlerException {
73 if (visited.contains(node)) return ;
74 visited.add(node);
75 List<Integer> list = m_graph.labels.get(node);
76 if (list == null) return ;
77 for (Iterator<Integer> iter = list.iterator(); iter.hasNext(); )
78 m_writer.handleStatement(m_graph.getStatement(node, iter.next()));
79 noOfStatements += list.size();
80 }
81
82 @Override
83 public void dispose() {
84 visited.clear();
85 }
86
87
88}
diff --git a/external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java b/external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java
new file mode 100644
index 0000000..592f249
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/RandomWalkMulti.java
@@ -0,0 +1,112 @@
1package uk.ac.ox.cs.data.sample;
2
3import java.util.HashSet;
4import java.util.Iterator;
5import java.util.LinkedList;
6import java.util.List;
7import java.util.Map;
8import java.util.Queue;
9import java.util.Set;
10import java.util.Stack;
11
12import org.openrdf.rio.RDFHandlerException;
13import org.openrdf.rio.turtle.TurtleWriter;
14
15import uk.ac.ox.cs.pagoda.util.Utility;
16
17
18public class RandomWalkMulti extends RandomWalk {
19
20 public RandomWalkMulti(RDFGraph graph, TurtleWriter writer) {
21 super(graph, writer);
22 }
23
24 Queue<Integer> queue = new LinkedList<Integer>();
25
26 @Override
27 public void sample() throws RDFHandlerException {
28 getStartNodes();
29
30 Utility.logInfo(queue.size());
31
32 int u, v, pick, index;
33 int individualLimit = statementLimit / queue.size(), currentLimit = 0;
34 RDFEdge edge;
35 List<RDFEdge> edges;
36 Stack<Integer> stack = new Stack<Integer>();
37 while (true) {
38 if (noOfStatements >= statementLimit) {
39 System.out.println("The number of statements in the sampling: " + noOfStatements);
40 return ;
41 }
42 if (noOfStatements >= currentLimit) {
43 stack.clear();
44 }
45
46 if (stack.isEmpty()) {
47 if (queue.isEmpty())
48 v = rand.nextInt(m_graph.numberOfIndividuals);
49 else {
50 v = queue.poll();
51 currentLimit += individualLimit;
52 }
53 stack.add(v);
54// Utility.logInfo(noOfStart + " new start: " + m_graph.getRawString(v));
55 visit(v);
56 }
57 u = stack.peek();
58 if (rand.nextInt(100) < 15) {
59 stack.pop();
60 continue;
61 }
62 if ((edges = m_graph.edges.get(u)) == null || edges.size() == 0) {
63 stack.clear();
64 continue;
65 }
66
67 index = 0;
68 pick = rand.nextInt(edges.size());
69 for (Iterator<RDFEdge> iter = edges.iterator(); iter.hasNext(); ++index) {
70 edge = iter.next();
71 if (index == pick) {
72 stack.add(v = edge.m_dst);
73 visit(v);
74 m_writer.handleStatement(m_graph.getStatement(u, edge.m_label, edge.m_dst));
75 ++noOfStatements;
76 iter.remove();
77 }
78
79 }
80 }
81 }
82
83 private void getStartNodes() throws RDFHandlerException {
84 Set<Integer> coveredConcepts = new HashSet<Integer>();
85 Integer concept;
86
87 Iterator<Integer> iter;
88 for (Map.Entry<Integer, LinkedList<Integer>> entry: m_graph.labels.entrySet()) {
89 iter = entry.getValue().iterator();
90 concept = null;
91
92 while (iter.hasNext()) {
93 if (!(coveredConcepts.contains(concept = iter.next()))) {
94 break;
95 }
96 else concept = null;
97
98 }
99
100 if (concept == null) continue;
101 else {
102 queue.add(entry.getKey());
103 coveredConcepts.add(concept);
104 while (iter.hasNext())
105 coveredConcepts.add(iter.next());
106 }
107 }
108
109 }
110
111
112}
diff --git a/external/uk/ac/ox/cs/data/sample/Sampler.java b/external/uk/ac/ox/cs/data/sample/Sampler.java
new file mode 100644
index 0000000..205b29b
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/Sampler.java
@@ -0,0 +1,23 @@
1package uk.ac.ox.cs.data.sample;
2
3import org.openrdf.rio.RDFHandlerException;
4import org.openrdf.rio.turtle.TurtleWriter;
5
6public abstract class Sampler {
7
8 protected RDFGraph m_graph;
9 protected TurtleWriter m_writer;
10
11 public Sampler(RDFGraph graph, TurtleWriter writer) {
12 m_graph = graph;
13 m_writer = writer;
14 }
15
16 public abstract void setLimit(int limit);
17
18 public abstract void sample() throws RDFHandlerException;
19
20 public abstract void dispose();
21
22
23}