about | summary | refs | log | tree | commit | diff
path: root/external/uk/ac/ox/cs/data/sample/DataSampling.java
diff options
context:
space:
mode:
Diffstat (limited to 'external/uk/ac/ox/cs/data/sample/DataSampling.java')
-rw-r--r-- external/uk/ac/ox/cs/data/sample/DataSampling.java 320
1 files changed, 320 insertions, 0 deletions
diff --git a/external/uk/ac/ox/cs/data/sample/DataSampling.java b/external/uk/ac/ox/cs/data/sample/DataSampling.java
new file mode 100644
index 0000000..1a788e3
--- /dev/null
+++ b/external/uk/ac/ox/cs/data/sample/DataSampling.java
@@ -0,0 +1,320 @@
1package uk.ac.ox.cs.data.sample;
2
3import java.io.File;
4import java.io.FileInputStream;
5import java.io.FileNotFoundException;
6import java.io.FileOutputStream;
7import java.io.IOException;
8import java.util.*;
9import java.util.Map.Entry;
10
11import org.openrdf.model.Resource;
12import org.openrdf.model.Statement;
13import org.openrdf.model.URI;
14import org.openrdf.model.Value;
15import org.openrdf.model.impl.StatementImpl;
16import org.openrdf.model.impl.URIImpl;
17import org.openrdf.rio.RDFHandler;
18import org.openrdf.rio.RDFHandlerException;
19import org.openrdf.rio.RDFParseException;
20import org.openrdf.rio.RDFParser;
21import org.openrdf.rio.ntriples.NTriplesParser;
22import org.openrdf.rio.turtle.*;
23
24import uk.ac.ox.cs.pagoda.owl.OWLHelper;
25import uk.ac.ox.cs.pagoda.util.Namespace;
26import uk.ac.ox.cs.pagoda.util.Utility;
27
28public class DataSampling {
29
30 File[] m_list;
31 RDFGraph m_graph;
32 double m_percentage;
33 Set<String> excludeEntities = new HashSet<String>();
34
35 public DataSampling(String prefix, String fileName, String excludeFile, double percentage, boolean inTurtle) {
36 if (excludeFile != null) {
37 try {
38 Scanner scanner = new Scanner(new File(excludeFile));
39 while (scanner.hasNextLine())
40 excludeEntities.add(OWLHelper.removeAngles(scanner.nextLine().trim()));
41 scanner.close();
42 } catch (FileNotFoundException e1) {
43 // TODO Auto-generated catch block
44 e1.printStackTrace();
45 }
46 }
47 excludeEntities.add("http://www.w3.org/2002/07/owl#imports");
48
49 File file = new File(fileName);
50 if (file.isDirectory()) m_list = file.listFiles();
51 else m_list = new File[] {file};
52 m_percentage = percentage;
53
54 RDFParser parser = inTurtle ? new TurtleParser() : new NTriplesParser();
55
56 GraphRDFHandler handler = new GraphRDFHandler(excludeEntities);
57 parser.setRDFHandler(handler);
58
59 FileInputStream istream;
60 try {
61 for (File tFile: m_list) {
62 parser.parse(istream = new FileInputStream(tFile), prefix);
63 istream.close();
64 }
65 } catch (IOException e) {
66 e.printStackTrace();
67 } catch (RDFParseException e) {
68 e.printStackTrace();
69 } catch (RDFHandlerException e) {
70 e.printStackTrace();
71 }
72
73 m_graph = handler.getGraph();
74 }
75
76 public void sample(String outputFile, boolean multiStart) {
77 try {
78 FileOutputStream ostream = new FileOutputStream(outputFile);
79 TurtleWriter writer = new TurtleWriter(ostream);
80 writer.startRDF();
81
82 if (m_percentage < 100) {
83 Sampler sam = multiStart ?
84 new RandomWalkMulti(m_graph, writer) :
85 new RandomWalk(m_graph, writer);
86 sam.setLimit((int) (m_graph.numberOfStatement / 100 * m_percentage));
87 System.out.println("Statement limit: " + (m_graph.numberOfStatement / 100 * m_percentage));
88 sam.sample();
89 sam.dispose();
90 }
91 else {
92 m_graph.visit(writer);
93 }
94 writer.endRDF();
95 ostream.close();
96 } catch (IOException e) {
97 e.printStackTrace();
98 } catch (RDFHandlerException e) {
99 // TODO Auto-generated catch block
100 e.printStackTrace();
101 }
102 }
103
104 public static void main(String[] args) {
105 sampleReactome();
106// sampleChEMBL();
107 }
108
109 static void sampleReactome() {
110// double[] ps = {40, 70, 100};
111 double[] ps = {0.25, 0.5, 0.75};
112 for (double per: ps) {
113 DataSampling sampling = new DataSampling(
114 "http://www.biopax.org/release/biopax-level3.owl#",
115// "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/data.ttl",
116 "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/data/simplified.nt",
117// "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/reactome_sample_40.ttl",
118 "/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/exclude",
119 per,
120 true);
121 sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/sample_test_" + per + ".ttl", true);
122// sampling.sample("/home/yzhou/krr-nas-share/Yujiao/ontologies/bio2rdf/reactome/graph sampling/simplifed_sample_test_" + per + ".ttl", true);
123// sampling.sample("output/sample_reactome_multi.ttl", true);
124 }
125 }
126
127 static void sampleChEMBL() {
128 DataSampling sampling = new DataSampling(
129 "http://rdf.ebi.ac.uk/terms/chembl",
130 "/home/yzhou/RDFdata/ChEMBL/facts/chembl_kbfile.nt",
131 null,
132 100,
133 false);
134
135 sampling.sample("output/sample_chembl_multi.ttl", true);
136 sampling.sample("output/sample_chembl.ttl", false);
137 }
138
139}
140
141class RDFGraph {
142
143 Map<Value, Integer> index = new HashMap<Value, Integer>();
144 Map<Integer, Value> inverseIndex = new HashMap<Integer, Value>();
145 MapToList<Integer> labels = new MapToList<Integer>();
146
147 MapToList<RDFEdge> edges = new MapToList<RDFEdge>();
148 Set<String> excludeEntities;
149
150 int numberOfIndividuals = 0, numberOfProperties = 0;
151
152 public RDFGraph(Set<String> exclude) {
153 excludeEntities = exclude;
154 for (String str: excludeEntities)
155 System.out.println(str);
156 System.out.println("---------------");
157 }
158
159 public void visit(TurtleWriter writer) throws RDFHandlerException {
160 Integer key;
161 for (Entry<Integer, LinkedList<Integer>> entry: labels.entrySet()) {
162 key = entry.getKey();
163 for (Integer type: entry.getValue())
164 writer.handleStatement(getStatement(key, type));
165 }
166
167 for (Entry<Integer, LinkedList<RDFEdge>> entry: edges.entrySet()) {
168 key = entry.getKey();
169 if ((inverseIndex.get(key) instanceof URI) &&
170 ((URI) inverseIndex.get(key)).toString().equals("http://www.reactome.org/biopax/46/879693#UnificationXref9"))
171 System.out.println("Here");
172
173 for (RDFEdge edge: entry.getValue())
174 writer.handleStatement(getStatement(key, edge.m_label, edge.m_dst));
175 }
176 }
177
178 private int getID(Value v, boolean isIndividual) {
179 if (v.toString().contains("imports"))
180 System.out.println(v.toString());
181 if (excludeEntities.contains(v.toString())) {
182 return 0;
183 }
184
185 Integer id = index.get(v);
186 if (id == null)
187 if (isIndividual) {
188 index.put(v, id = ++numberOfIndividuals);
189 inverseIndex.put(id, v);
190 }
191 else {
192 index.put(v, id = --numberOfProperties);
193 inverseIndex.put(id, v);
194 }
195 return id;
196 }
197
198 int numberOfStatement = 0;
199 int counter = 0;
200
201 public void addTriple(Resource s, URI p, Value o) {
202 ++numberOfStatement;
203 if (numberOfStatement % 1000000 == 0) {
204 Utility.logInfo("No.of statements: " + numberOfStatement, "\tNo.of individuals: " + numberOfIndividuals, "\tNo.of predicates: " + (-numberOfProperties));
205 }
206
207 if (p.equals(rdftype)) {
208 int type = getID(o, false), i = getID(s, true);
209 if (i == 0) {
210// System.out.println("<" + s + "> <" + p + "> <" + o + ">");
211 return ;
212 }
213 labels.add(i, type);
214 }
215 else {
216 int i = getID(s, true), j = getID(o, true), prop = getID(p, false) ;
217 if (i == 0 || j == 0 || prop == 0) {
218// System.out.println("<" + s + "> <" + p + "> <" + o + ">");
219 return ;
220 }
221 edges.add(i, new RDFEdge(prop, j));
222 }
223 }
224
225 URI rdftype = new URIImpl(Namespace.RDF_TYPE);
226
227 public Statement getStatement(int... args) {
228 if (args.length == 2)
229 return new StatementImpl((Resource) inverseIndex.get(args[0]), rdftype, (Value) inverseIndex.get(args[1]));
230 else if (args.length == 3)
231 return new StatementImpl((Resource) inverseIndex.get(args[0]), (URI) inverseIndex.get(args[1]), (Value) inverseIndex.get(args[2]));
232 return null;
233 }
234
235 public String getRawString(int id) {
236 return inverseIndex.get(id).toString();
237 }
238
239}
240
/**
 * A minimal multimap from integer keys to lists of values.
 *
 * <p>Values under one key keep their insertion order until {@link #shuffle()} is called.
 */
class MapToList<T> {

    private Map<Integer, LinkedList<T>> buckets = new HashMap<Integer, LinkedList<T>>();

    /** Appends {@code value} to the list stored under {@code key}, creating the list if needed. */
    public void add(int key, T value) {
        if (!buckets.containsKey(key)) {
            buckets.put(key, new LinkedList<T>());
        }
        buckets.get(key).add(value);
    }

    /** Returns the live entry set of the underlying map. */
    public Set<Map.Entry<Integer, LinkedList<T>>> entrySet() {
        return buckets.entrySet();
    }

    /** Randomly permutes each list in place. */
    public void shuffle() {
        Iterator<LinkedList<T>> lists = buckets.values().iterator();
        while (lists.hasNext()) {
            Collections.shuffle(lists.next());
        }
    }

    /** Returns the list stored under {@code key}, or {@code null} if the key is absent. */
    public LinkedList<T> get(int key) {
        return buckets.get(Integer.valueOf(key));
    }

}
266
/**
 * One outgoing edge of an {@code RDFGraph} node: the id of the edge's predicate together
 * with the id of the node it points to.
 */
class RDFEdge {

    /** Id of the predicate labelling this edge. */
    int m_label;

    /** Id of the destination node. */
    int m_dst;

    /**
     * @param label id of the predicate
     * @param dst   id of the destination node
     */
    public RDFEdge(int label, int dst) {
        this.m_dst = dst;
        this.m_label = label;
    }

}
277
278class GraphRDFHandler implements RDFHandler {
279
280 RDFGraph m_graph;
281 Set<String> m_exclude;
282
283 public GraphRDFHandler(Set<String> excludeEntities) {
284 m_exclude = excludeEntities;
285 }
286
287 @Override
288 public void startRDF() throws RDFHandlerException {
289 m_graph = new RDFGraph(m_exclude);
290 }
291
292 public RDFGraph getGraph() {
293 return m_graph;
294 }
295
296 @Override
297 public void endRDF() throws RDFHandlerException {
298 // TODO Auto-generated method stub
299
300 }
301
302 @Override
303 public void handleNamespace(String prefix, String uri)
304 throws RDFHandlerException {
305 // TODO Auto-generated method stub
306
307 }
308
309 @Override
310 public void handleStatement(Statement st) throws RDFHandlerException {
311 m_graph.addTriple(st.getSubject(), st.getPredicate(), st.getObject());
312 }
313
314 @Override
315 public void handleComment(String comment) throws RDFHandlerException {
316 // TODO Auto-generated method stub
317
318 }
319
320} \ No newline at end of file