aboutsummaryrefslogtreecommitdiff
path: root/cpp
diff options
context:
space:
mode:
authoryzhou <yujiao.zhou@gmail.com>2015-04-21 10:34:27 +0100
committeryzhou <yujiao.zhou@gmail.com>2015-04-21 10:34:27 +0100
commit9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8 (patch)
tree47511c0fb89dccff0db4b5990522e04f294d795b /cpp
parentb1ac207612ee8b045244253fb94b866104bc34f2 (diff)
downloadACQuA-9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8.tar.gz
ACQuA-9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8.zip
initial version
Diffstat (limited to 'cpp')
-rw-r--r--cpp/sample_cpp.cpp288
-rw-r--r--cpp/sample_cpp_refined.cpp300
2 files changed, 588 insertions, 0 deletions
diff --git a/cpp/sample_cpp.cpp b/cpp/sample_cpp.cpp
new file mode 100644
index 0000000..769364f
--- /dev/null
+++ b/cpp/sample_cpp.cpp
@@ -0,0 +1,288 @@
1#include<iostream>
2#include<fstream>
3#include<sstream>
4#include<cstdlib>
5#include<map>
6#include<string>
7#include<list>
8#include<utility>
9#include<set>
10#include<stack>
11#include<time.h>
12
13using namespace std;
14
15const string rdftype = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>";
16const int MAXN = 100000000;
17
18map<string, int> inverseIndex4Individual;
19map<string, int> inverseIndex4Predicate;
20string *indexingIndividual;
21string *indexingPredicate;
22int noOfIndividuals(0), noOfPredicates(0), noOfStatements(0);
23
24list<int> *labels;
25list<pair<int,int> > *edges;
26bool *visited;
27
28int lineNumber = 0;
29pair<int,int> **edges_array;
30int *edges_array_size;
31
32int getID(string s, bool isPredicate) {
33 int ret;
34 if (isPredicate) {
35 if (!(ret = inverseIndex4Predicate[s])) {
36 inverseIndex4Predicate[s] = (ret = ++noOfPredicates);
37 indexingPredicate[ret] = s;
38 cout << "new predicate: " << s << " " << ret << endl;
39 }
40 }
41 else {
42 if (!(ret = inverseIndex4Individual[s])) {
43 inverseIndex4Individual[s] = (ret = ++noOfIndividuals);
44 indexingIndividual[ret] = s;
45 }
46 }
47 return ret;
48}
49
50void addTriple(string s, string p, string o) {
51 //cout << s << " " << p << " " << o << endl;
52 if (!p.compare(rdftype)) {
53 int A(getID(o, true)), c(getID(s, false));
54 labels[c].push_back(A);
55 }
56 else {
57 int r(getID(p, true)), a(getID(s, false)), b(getID(o, false));
58 //cout << r << " " << a << " " << b << endl;
59 //cout << edges[a].size() << endl;
60 edges[a].push_back(make_pair(r, b));
61 }
62 ++noOfStatements;
63 if (noOfStatements % 1000000 == 0) {
64 cout <<"No.of statments: " << noOfStatements
65 << ", No.of individuals " << noOfIndividuals
66 << ", No.of predicates " << noOfPredicates << endl;
67 }
68}
69
70ofstream output;
71
72int visit(int id ) {
73 int num = 0;
74 if (!visited[id]) {
75 visited[id] = true;
76 for (list<int>::iterator it = labels[id].begin(); it != labels[id].end(); ++it, ++num)
77 output << indexingIndividual[id] << " " << rdftype << " " << indexingPredicate[*it] << " ." << endl;
78 }
79 return num;
80}
81
82void printList(int u, list<pair<int, int> >::iterator first, list<pair<int, int> >::iterator last) {
83 cout << "-------------" << endl;
84 for (; first != last; ++first)
85 cout << u << " " << first->first << " " << first->second << endl;
86 cout << "-------------" << endl;
87}
88
89void sample(int percentage) {
90 stringstream ss;
91 ss << "sample_" << percentage << ".nt";
92 string outputName;
93 ss >> outputName;
94
95 int noOfRestIndividuals = noOfIndividuals;
96 int *restIndividualsArray = new int[noOfIndividuals];
97 int *number2index = new int[noOfIndividuals + 1];
98 for (int i = 0, j = 1; i < noOfIndividuals; ++i, ++j) {
99 restIndividualsArray[i] = j;
100 number2index[j] = i;
101 }
102
103 cout << "The sampled data is going to save in " << outputName << endl;
104 cout << "The number of individuals: " << noOfIndividuals << endl;
105
106 output.open(outputName.c_str(), ofstream::out);
107 int limit(noOfStatements * percentage / 100), choosen(0);
108
109 bool flag;
110 stack<int> s;
111 int p_from, p_to;
112 int u, v, individualIndex, t, pick, len;
113 while (true) {
114 if (choosen >= limit) break;
115
116 if (s.empty()) {
117 //cout << noOfIndividuals << " " << noOfRestIndividuals << endl;
118 s.push(u = restIndividualsArray[rand() % noOfRestIndividuals]);
119 choosen += visit(u);
120 //cout << "A new start " << indexing[u] << endl;
121 }
122 u = s.top();
123 //cout << "The current element is :" << indexing[u] << endl;
124 //if (choosen % 1000000 == 0)
125 //cout << "No of choosen statement: " << choosen << " No of rest Individuals: " << noOfRestIndividuals << endl;
126
127 if (rand() % 100 < 15) {
128 //cout << "back to its father" << endl;
129 s.pop();
130 continue;
131 }
132
133 //cout << "outgoing edges: " << edges_array_size[u] << endl;
134 if (!edges_array_size[u]) {
135 //cout << "empty node" << endl;
136 while (!s.empty()) s.pop();
137 t = number2index[u];
138 if (t == -1) continue;
139 --noOfRestIndividuals;
140 //cout << indexing[u] << " has been removed with current index " << index << " and number " << u << endl;
141 v = restIndividualsArray[noOfRestIndividuals];
142 restIndividualsArray[t] = v;
143 number2index[v] = t;
144 number2index[u] = -1;
145 continue;
146 }
147
148 pick = rand() % edges_array_size[u];
149 if (pick < 0) pick += edges_array_size[u];
150 p_from = edges_array[u][pick].first;
151 p_to = edges_array[u][pick].second;
152 len = --edges_array_size[u];
153 edges_array[u][pick].first = edges_array[u][len].first;
154 edges_array[u][pick].second = edges_array[u][len].second;
155 s.push(p_to);
156 choosen += visit(p_to) + 1;
157 if (!indexingPredicate[p_from].size()) {
158 cout << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl;
159 return ;
160 }
161 output << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl;
162 }
163
164 delete[] restIndividualsArray;
165 delete[] number2index;
166 cout << "The number of statements: " << noOfStatements << " (choosen: " << choosen << ")" << endl;
167 cout << "The number of rest Individuals: " << noOfRestIndividuals << endl;
168 output.close();
169}
170
171int main(int argc, char **argv) {
172 if (argc < 3) return 0;
173 cout << "Trying to open the file " << argv[1] << endl;
174 cout << "Get " << argv[2] << "/100 of the file" << endl;
175
176 int startLine(-1);
177 istringstream iss;
178 if (argc > 3) {
179 iss.str(argv[3]);
180 iss >> startLine;
181 iss.clear();
182 }
183 cout << "Problematic line starts at " << startLine << endl;
184
185 ifstream input;
186 input.open(argv[1]);
187 if (!input) {
188 cout << "No such file" << endl;
189 }
190 cout << "The input file is " << argv[1] << endl;
191
192 int number, index(0);
193 string subject, predicate, object;
194
195 clock_t start = clock();
196 cout << "Starting constructing the graph..." << endl;
197 indexingIndividual = new string[MAXN];
198 indexingPredicate = new string[MAXN];
199 labels = new list<int>[MAXN];
200 edges = new list<pair<int, int> >[MAXN];
201 cout << "mem alloc time: " << ((float) clock() - (float) start) / CLOCKS_PER_SEC << " sec" << endl;
202
203 int noOfInvalidLines = 6;
204 int invalidLines[] = { 700761, 703992, 750181, 750183, 750185, 750315};
205 int point(startLine == -1 ? noOfInvalidLines : 0);
206 for (int i = point; i < noOfInvalidLines; ++i)
207 // 288884361
208 invalidLines[i] = startLine + (invalidLines[i] - 700761) - 1;
209
210 string line, nextLine;
211 while (!input.eof()) {
212 if (input >> subject >> predicate); else break;
213 getline(input, object);
214 ++lineNumber;
215 //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl;
216
217 object.erase(0, 1);
218 if (point < noOfInvalidLines && lineNumber == invalidLines[point]) {
219 getline(input, nextLine);
220 ++lineNumber;
221 ++point;
222 //cout << lineNumber << endl << object << endl << " " << nextLine << endl;
223 object.replace(object.size() - 1, 1, nextLine);
224 //cout << object << endl;
225 }
226
227 object.erase(object.size() - 2, 2);
228
229 //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl;
230 addTriple(subject, predicate, object);
231 //cout << "Triple added!" << endl;
232 }
233 cout << lineNumber << " " << subject << " " << predicate << " " << object << endl;
234
235 input.close();
236
237 cout << "Graph constructed!" << endl;
238 float constructionTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60);
239 cout << "Construction time: " << constructionTime << " min" << endl;
240 //return 0;
241
242 edges_array = new pair<int,int>* [noOfIndividuals + 1];
243 edges_array_size = new int[noOfIndividuals + 1];
244 for (int i = 1, j = 0; i <= noOfIndividuals; ++i) {
245 edges_array_size[i] = edges[i].size();
246 edges_array[i] = new pair<int,int>[edges_array_size[i]];
247 j = 0;
248 for (list<pair<int,int> >::iterator it = edges[i].begin(); it != edges[i].end(); ++it, ++j) {
249 edges_array[i][j] = *it;
250 }
251 }
252
253 delete[] edges;
254 constructionTime = ((float) clock() - (float) start) / CLOCKS_PER_SEC / 60;
255 cout << "Rearrange done: " << constructionTime << " min" << endl;
256
257 visited = new bool[noOfIndividuals + 1];
258 for (int i = 1; i < noOfIndividuals + 1; ++i)
259 visited[i] = 0;
260
261 int percentage;
262
263 srand(19900114);
264 if (argc > 2) {
265 iss.str(argv[2]);
266 iss >> percentage;
267 iss.clear();
268 }
269 else percentage = 1;
270
271 if (percentage == 0) return 0;
272 sample(percentage);
273
274 cout << "Success!" << endl;
275 float totalTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60);
276 cout << "Total time: " << totalTime << " (Sample time: " << totalTime - constructionTime << ") min" << endl;
277
278 delete[] indexingIndividual;
279 delete[] visited;
280 delete[] labels;
281
282 for (int i = 1; i <= noOfIndividuals; ++i)
283 delete[] edges_array[i];
284
285 delete[] edges_array;
286
287 return 0;
288}
diff --git a/cpp/sample_cpp_refined.cpp b/cpp/sample_cpp_refined.cpp
new file mode 100644
index 0000000..f04ed7d
--- /dev/null
+++ b/cpp/sample_cpp_refined.cpp
@@ -0,0 +1,300 @@
1#include<iostream>
2#include<fstream>
3#include<sstream>
4#include<cstdlib>
5#include<map>
6#include<string>
7#include<list>
8#include<utility>
9#include<set>
10#include<stack>
11#include<time.h>
12
13using namespace std;
14
15const string rdftype = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>";
16const int MAXN = 100000000;
17
18map<string, int> inverseIndex4Individual;
19map<string, int> inverseIndex4Predicate;
20string *indexingIndividual;
21string *indexingPredicate;
22int noOfIndividuals(0), noOfPredicates(0), noOfStatements(0);
23
24list<int> *labels;
25list<pair<int,int> > *edges;
26bool *visited;
27
28int lineNumber = 0;
29pair<int,int> **edges_array;
30int *edges_array_size;
31string outputName;
32
33int getID(string s, bool isPredicate) {
34 int ret;
35 if (isPredicate) {
36 if (!(ret = inverseIndex4Predicate[s])) {
37 inverseIndex4Predicate[s] = (ret = ++noOfPredicates);
38 indexingPredicate[ret] = s;
39 cout << "new predicate: " << s << " " << ret << endl;
40 }
41 }
42 else {
43 if (!(ret = inverseIndex4Individual[s])) {
44 inverseIndex4Individual[s] = (ret = ++noOfIndividuals);
45 indexingIndividual[ret] = s;
46 }
47 }
48 return ret;
49}
50
51void addTriple(string s, string p, string o) {
52 //cout << s << " " << p << " " << o << endl;
53 if (!p.compare(rdftype)) {
54 int A(getID(o, true)), c(getID(s, false));
55 labels[c].push_back(A);
56 }
57 else {
58 int r(getID(p, true)), a(getID(s, false)), b(getID(o, false));
59 //cout << r << " " << a << " " << b << endl;
60 //cout << edges[a].size() << endl;
61 edges[a].push_back(make_pair(r, b));
62 }
63 ++noOfStatements;
64 if (noOfStatements % 1000000 == 0) {
65 cout <<"No.of statments: " << noOfStatements
66 << ", No.of individuals " << noOfIndividuals
67 << ", No.of predicates " << noOfPredicates << endl;
68 }
69}
70
71ofstream output;
72
73int visit(int id ) {
74 int num = 0;
75 if (!visited[id]) {
76 visited[id] = true;
77 for (list<int>::iterator it = labels[id].begin(); it != labels[id].end(); ++it, ++num)
78 output << indexingIndividual[id] << " " << rdftype << " " << indexingPredicate[*it] << " ." << endl;
79 }
80 return num;
81}
82
83void printList(int u, list<pair<int, int> >::iterator first, list<pair<int, int> >::iterator last) {
84 cout << "-------------" << endl;
85 for (; first != last; ++first)
86 cout << u << " " << first->first << " " << first->second << endl;
87 cout << "-------------" << endl;
88}
89
90void sample(int percentage) {
91 int noOfRestIndividuals = noOfIndividuals;
92 int *restIndividualsArray = new int[noOfIndividuals];
93 int *number2index = new int[noOfIndividuals + 1];
94 for (int i = 0, j = 1; i < noOfIndividuals; ++i, ++j) {
95 restIndividualsArray[i] = j;
96 number2index[j] = i;
97 }
98
99 cout << "The sampled data is going to save in " << outputName << endl;
100 cout << "The number of individuals: " << noOfIndividuals << endl;
101
102 output.open(outputName.c_str(), ofstream::out);
103 int limit(noOfStatements / 100 * percentage), choosen(0);
104 //cout << noOfStatements * percentage << endl;
105 cout << "Percentage " << percentage << " Limit " << limit << endl;
106
107 bool flag;
108 stack<int> s;
109 int p_from, p_to;
110 int u, v, individualIndex, t, pick, len;
111 while (true) {
112 if (choosen >= limit) break;
113
114 if (s.empty()) {
115 //cout << noOfIndividuals << " " << noOfRestIndividuals << endl;
116 s.push(u = restIndividualsArray[rand() % noOfRestIndividuals]);
117 choosen += visit(u);
118 //cout << "A new start " << indexing[u] << endl;
119 }
120 u = s.top();
121 //cout << "The current element is :" << indexing[u] << endl;
122 //if (choosen % 1000000 == 0)
123 //cout << "No of choosen statement: " << choosen << " No of rest Individuals: " << noOfRestIndividuals << endl;
124
125 if (rand() % 100 < 15) {
126 //cout << "back to its father" << endl;
127 s.pop();
128 continue;
129 }
130
131 //cout << "outgoing edges: " << edges_array_size[u] << endl;
132 if (!edges_array_size[u]) {
133 //cout << "empty node" << endl;
134 while (!s.empty()) s.pop();
135 t = number2index[u];
136 if (t == -1) continue;
137 --noOfRestIndividuals;
138 //cout << indexing[u] << " has been removed with current index " << index << " and number " << u << endl;
139 v = restIndividualsArray[noOfRestIndividuals];
140 restIndividualsArray[t] = v;
141 number2index[v] = t;
142 number2index[u] = -1;
143 continue;
144 }
145
146 pick = rand() % edges_array_size[u];
147 if (pick < 0) pick += edges_array_size[u];
148 p_from = edges_array[u][pick].first;
149 p_to = edges_array[u][pick].second;
150 len = --edges_array_size[u];
151 edges_array[u][pick].first = edges_array[u][len].first;
152 edges_array[u][pick].second = edges_array[u][len].second;
153 s.push(p_to);
154 choosen += visit(p_to) + 1;
155 if (!indexingPredicate[p_from].size()) {
156 cout << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl;
157 return ;
158 }
159 output << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl;
160 }
161
162 delete[] restIndividualsArray;
163 delete[] number2index;
164 cout << "The number of statements: " << noOfStatements << " (choosen: " << choosen << ")" << endl;
165 cout << "The number of rest Individuals: " << noOfRestIndividuals << endl;
166 output.close();
167}
168
169int main(int argc, char **argv) {
170 if (argc < 4) {
171 cout << "./sample originalDataFile percentage outputFile" << endl;
172 return 0;
173 }
174
175 if (argc < 3) return 0;
176 cout << "Trying to open the file " << argv[1] << endl;
177 cout << "Get " << argv[2] << "/100 of the file" << endl;
178
179 istringstream iss;
180 iss.str(argv[3]);
181 iss >> outputName;
182 iss.clear();
183
184 //stringstream ss;
185 //ss << "/media/krr-nas-share/Yujiao/ontologies/bio2rdf/chembl/graph sampling/sample_" << percentage << ".nt";
186 //string outputName;
187 //ss >> outputName;
188
189 int startLine(-1);
190 if (argc > 4) {
191 iss.str(argv[4]);
192 iss >> startLine;
193 iss.clear();
194 }
195 cout << "Problematic line starts at " << startLine << endl;
196
197 ifstream input;
198 input.open(argv[1]);
199 if (!input) {
200 cout << "No such file" << endl;
201 }
202 cout << "The input file is " << argv[1] << endl;
203
204 int number, index(0);
205 string subject, predicate, object;
206
207 clock_t start = clock();
208 cout << "Starting constructing the graph..." << endl;
209 indexingIndividual = new string[MAXN];
210 indexingPredicate = new string[MAXN];
211 labels = new list<int>[MAXN];
212 edges = new list<pair<int, int> >[MAXN];
213 cout << "mem alloc time: " << ((float) clock() - (float) start) / CLOCKS_PER_SEC << " sec" << endl;
214
215 int noOfInvalidLines = 6;
216 int invalidLines[] = { 700761, 703992, 750181, 750183, 750185, 750315};
217 int point(startLine == -1 ? noOfInvalidLines : 0);
218 for (int i = point; i < noOfInvalidLines; ++i)
219 // 288884361
220 invalidLines[i] = startLine + (invalidLines[i] - 700761) - 1;
221
222 string line, nextLine;
223 while (!input.eof()) {
224 if (input >> subject >> predicate); else break;
225 getline(input, object);
226 ++lineNumber;
227 //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl;
228
229 object.erase(0, 1);
230 if (point < noOfInvalidLines && lineNumber == invalidLines[point]) {
231 getline(input, nextLine);
232 ++lineNumber;
233 ++point;
234 //cout << lineNumber << endl << object << endl << " " << nextLine << endl;
235 object.replace(object.size() - 1, 1, nextLine);
236 //cout << object << endl;
237 }
238
239 object.erase(object.size() - 2, 2);
240
241 //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl;
242 addTriple(subject, predicate, object);
243 //cout << "Triple added!" << endl;
244 }
245 cout << lineNumber << " " << subject << " " << predicate << " " << object << endl;
246
247 input.close();
248
249 cout << "Graph constructed!" << endl;
250 float constructionTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60);
251 cout << "Construction time: " << constructionTime << " min" << endl;
252 //return 0;
253
254 edges_array = new pair<int,int>* [noOfIndividuals + 1];
255 edges_array_size = new int[noOfIndividuals + 1];
256 for (int i = 1, j = 0; i <= noOfIndividuals; ++i) {
257 edges_array_size[i] = edges[i].size();
258 edges_array[i] = new pair<int,int>[edges_array_size[i]];
259 j = 0;
260 for (list<pair<int,int> >::iterator it = edges[i].begin(); it != edges[i].end(); ++it, ++j) {
261 edges_array[i][j] = *it;
262 }
263 }
264
265 delete[] edges;
266 constructionTime = ((float) clock() - (float) start) / CLOCKS_PER_SEC / 60;
267 cout << "Rearrange done: " << constructionTime << " min" << endl;
268
269 visited = new bool[noOfIndividuals + 1];
270 for (int i = 1; i < noOfIndividuals + 1; ++i)
271 visited[i] = 0;
272
273 int percentage;
274
275 srand(19900114);
276 if (argc > 2) {
277 iss.str(argv[2]);
278 iss >> percentage;
279 iss.clear();
280 }
281 else percentage = 1;
282
283 if (percentage == 0) return 0;
284 sample(percentage);
285
286 cout << "Success!" << endl;
287 float totalTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60);
288 cout << "Total time: " << totalTime << " (Sample time: " << totalTime - constructionTime << ") min" << endl;
289
290 delete[] indexingIndividual;
291 delete[] visited;
292 delete[] labels;
293
294 for (int i = 1; i <= noOfIndividuals; ++i)
295 delete[] edges_array[i];
296
297 delete[] edges_array;
298
299 return 0;
300}