aboutsummaryrefslogtreecommitdiff
path: root/cpp/sample_cpp_refined.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'cpp/sample_cpp_refined.cpp')
-rw-r--r--cpp/sample_cpp_refined.cpp300
1 files changed, 300 insertions, 0 deletions
diff --git a/cpp/sample_cpp_refined.cpp b/cpp/sample_cpp_refined.cpp
new file mode 100644
index 0000000..f04ed7d
--- /dev/null
+++ b/cpp/sample_cpp_refined.cpp
@@ -0,0 +1,300 @@
1#include<iostream>
2#include<fstream>
3#include<sstream>
4#include<cstdlib>
5#include<map>
6#include<string>
7#include<list>
8#include<utility>
9#include<set>
10#include<stack>
11#include<time.h>
12
13using namespace std;
14
15const string rdftype = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>";
16const int MAXN = 100000000;
17
18map<string, int> inverseIndex4Individual;
19map<string, int> inverseIndex4Predicate;
20string *indexingIndividual;
21string *indexingPredicate;
22int noOfIndividuals(0), noOfPredicates(0), noOfStatements(0);
23
24list<int> *labels;
25list<pair<int,int> > *edges;
26bool *visited;
27
28int lineNumber = 0;
29pair<int,int> **edges_array;
30int *edges_array_size;
31string outputName;
32
33int getID(string s, bool isPredicate) {
34 int ret;
35 if (isPredicate) {
36 if (!(ret = inverseIndex4Predicate[s])) {
37 inverseIndex4Predicate[s] = (ret = ++noOfPredicates);
38 indexingPredicate[ret] = s;
39 cout << "new predicate: " << s << " " << ret << endl;
40 }
41 }
42 else {
43 if (!(ret = inverseIndex4Individual[s])) {
44 inverseIndex4Individual[s] = (ret = ++noOfIndividuals);
45 indexingIndividual[ret] = s;
46 }
47 }
48 return ret;
49}
50
51void addTriple(string s, string p, string o) {
52 //cout << s << " " << p << " " << o << endl;
53 if (!p.compare(rdftype)) {
54 int A(getID(o, true)), c(getID(s, false));
55 labels[c].push_back(A);
56 }
57 else {
58 int r(getID(p, true)), a(getID(s, false)), b(getID(o, false));
59 //cout << r << " " << a << " " << b << endl;
60 //cout << edges[a].size() << endl;
61 edges[a].push_back(make_pair(r, b));
62 }
63 ++noOfStatements;
64 if (noOfStatements % 1000000 == 0) {
65 cout <<"No.of statments: " << noOfStatements
66 << ", No.of individuals " << noOfIndividuals
67 << ", No.of predicates " << noOfPredicates << endl;
68 }
69}
70
71ofstream output;
72
73int visit(int id ) {
74 int num = 0;
75 if (!visited[id]) {
76 visited[id] = true;
77 for (list<int>::iterator it = labels[id].begin(); it != labels[id].end(); ++it, ++num)
78 output << indexingIndividual[id] << " " << rdftype << " " << indexingPredicate[*it] << " ." << endl;
79 }
80 return num;
81}
82
83void printList(int u, list<pair<int, int> >::iterator first, list<pair<int, int> >::iterator last) {
84 cout << "-------------" << endl;
85 for (; first != last; ++first)
86 cout << u << " " << first->first << " " << first->second << endl;
87 cout << "-------------" << endl;
88}
89
90void sample(int percentage) {
91 int noOfRestIndividuals = noOfIndividuals;
92 int *restIndividualsArray = new int[noOfIndividuals];
93 int *number2index = new int[noOfIndividuals + 1];
94 for (int i = 0, j = 1; i < noOfIndividuals; ++i, ++j) {
95 restIndividualsArray[i] = j;
96 number2index[j] = i;
97 }
98
99 cout << "The sampled data is going to save in " << outputName << endl;
100 cout << "The number of individuals: " << noOfIndividuals << endl;
101
102 output.open(outputName.c_str(), ofstream::out);
103 int limit(noOfStatements / 100 * percentage), choosen(0);
104 //cout << noOfStatements * percentage << endl;
105 cout << "Percentage " << percentage << " Limit " << limit << endl;
106
107 bool flag;
108 stack<int> s;
109 int p_from, p_to;
110 int u, v, individualIndex, t, pick, len;
111 while (true) {
112 if (choosen >= limit) break;
113
114 if (s.empty()) {
115 //cout << noOfIndividuals << " " << noOfRestIndividuals << endl;
116 s.push(u = restIndividualsArray[rand() % noOfRestIndividuals]);
117 choosen += visit(u);
118 //cout << "A new start " << indexing[u] << endl;
119 }
120 u = s.top();
121 //cout << "The current element is :" << indexing[u] << endl;
122 //if (choosen % 1000000 == 0)
123 //cout << "No of choosen statement: " << choosen << " No of rest Individuals: " << noOfRestIndividuals << endl;
124
125 if (rand() % 100 < 15) {
126 //cout << "back to its father" << endl;
127 s.pop();
128 continue;
129 }
130
131 //cout << "outgoing edges: " << edges_array_size[u] << endl;
132 if (!edges_array_size[u]) {
133 //cout << "empty node" << endl;
134 while (!s.empty()) s.pop();
135 t = number2index[u];
136 if (t == -1) continue;
137 --noOfRestIndividuals;
138 //cout << indexing[u] << " has been removed with current index " << index << " and number " << u << endl;
139 v = restIndividualsArray[noOfRestIndividuals];
140 restIndividualsArray[t] = v;
141 number2index[v] = t;
142 number2index[u] = -1;
143 continue;
144 }
145
146 pick = rand() % edges_array_size[u];
147 if (pick < 0) pick += edges_array_size[u];
148 p_from = edges_array[u][pick].first;
149 p_to = edges_array[u][pick].second;
150 len = --edges_array_size[u];
151 edges_array[u][pick].first = edges_array[u][len].first;
152 edges_array[u][pick].second = edges_array[u][len].second;
153 s.push(p_to);
154 choosen += visit(p_to) + 1;
155 if (!indexingPredicate[p_from].size()) {
156 cout << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl;
157 return ;
158 }
159 output << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl;
160 }
161
162 delete[] restIndividualsArray;
163 delete[] number2index;
164 cout << "The number of statements: " << noOfStatements << " (choosen: " << choosen << ")" << endl;
165 cout << "The number of rest Individuals: " << noOfRestIndividuals << endl;
166 output.close();
167}
168
169int main(int argc, char **argv) {
170 if (argc < 4) {
171 cout << "./sample originalDataFile percentage outputFile" << endl;
172 return 0;
173 }
174
175 if (argc < 3) return 0;
176 cout << "Trying to open the file " << argv[1] << endl;
177 cout << "Get " << argv[2] << "/100 of the file" << endl;
178
179 istringstream iss;
180 iss.str(argv[3]);
181 iss >> outputName;
182 iss.clear();
183
184 //stringstream ss;
185 //ss << "/media/krr-nas-share/Yujiao/ontologies/bio2rdf/chembl/graph sampling/sample_" << percentage << ".nt";
186 //string outputName;
187 //ss >> outputName;
188
189 int startLine(-1);
190 if (argc > 4) {
191 iss.str(argv[4]);
192 iss >> startLine;
193 iss.clear();
194 }
195 cout << "Problematic line starts at " << startLine << endl;
196
197 ifstream input;
198 input.open(argv[1]);
199 if (!input) {
200 cout << "No such file" << endl;
201 }
202 cout << "The input file is " << argv[1] << endl;
203
204 int number, index(0);
205 string subject, predicate, object;
206
207 clock_t start = clock();
208 cout << "Starting constructing the graph..." << endl;
209 indexingIndividual = new string[MAXN];
210 indexingPredicate = new string[MAXN];
211 labels = new list<int>[MAXN];
212 edges = new list<pair<int, int> >[MAXN];
213 cout << "mem alloc time: " << ((float) clock() - (float) start) / CLOCKS_PER_SEC << " sec" << endl;
214
215 int noOfInvalidLines = 6;
216 int invalidLines[] = { 700761, 703992, 750181, 750183, 750185, 750315};
217 int point(startLine == -1 ? noOfInvalidLines : 0);
218 for (int i = point; i < noOfInvalidLines; ++i)
219 // 288884361
220 invalidLines[i] = startLine + (invalidLines[i] - 700761) - 1;
221
222 string line, nextLine;
223 while (!input.eof()) {
224 if (input >> subject >> predicate); else break;
225 getline(input, object);
226 ++lineNumber;
227 //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl;
228
229 object.erase(0, 1);
230 if (point < noOfInvalidLines && lineNumber == invalidLines[point]) {
231 getline(input, nextLine);
232 ++lineNumber;
233 ++point;
234 //cout << lineNumber << endl << object << endl << " " << nextLine << endl;
235 object.replace(object.size() - 1, 1, nextLine);
236 //cout << object << endl;
237 }
238
239 object.erase(object.size() - 2, 2);
240
241 //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl;
242 addTriple(subject, predicate, object);
243 //cout << "Triple added!" << endl;
244 }
245 cout << lineNumber << " " << subject << " " << predicate << " " << object << endl;
246
247 input.close();
248
249 cout << "Graph constructed!" << endl;
250 float constructionTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60);
251 cout << "Construction time: " << constructionTime << " min" << endl;
252 //return 0;
253
254 edges_array = new pair<int,int>* [noOfIndividuals + 1];
255 edges_array_size = new int[noOfIndividuals + 1];
256 for (int i = 1, j = 0; i <= noOfIndividuals; ++i) {
257 edges_array_size[i] = edges[i].size();
258 edges_array[i] = new pair<int,int>[edges_array_size[i]];
259 j = 0;
260 for (list<pair<int,int> >::iterator it = edges[i].begin(); it != edges[i].end(); ++it, ++j) {
261 edges_array[i][j] = *it;
262 }
263 }
264
265 delete[] edges;
266 constructionTime = ((float) clock() - (float) start) / CLOCKS_PER_SEC / 60;
267 cout << "Rearrange done: " << constructionTime << " min" << endl;
268
269 visited = new bool[noOfIndividuals + 1];
270 for (int i = 1; i < noOfIndividuals + 1; ++i)
271 visited[i] = 0;
272
273 int percentage;
274
275 srand(19900114);
276 if (argc > 2) {
277 iss.str(argv[2]);
278 iss >> percentage;
279 iss.clear();
280 }
281 else percentage = 1;
282
283 if (percentage == 0) return 0;
284 sample(percentage);
285
286 cout << "Success!" << endl;
287 float totalTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60);
288 cout << "Total time: " << totalTime << " (Sample time: " << totalTime - constructionTime << ") min" << endl;
289
290 delete[] indexingIndividual;
291 delete[] visited;
292 delete[] labels;
293
294 for (int i = 1; i <= noOfIndividuals; ++i)
295 delete[] edges_array[i];
296
297 delete[] edges_array;
298
299 return 0;
300}