diff options
| author | yzhou <yujiao.zhou@gmail.com> | 2015-04-21 10:34:27 +0100 |
|---|---|---|
| committer | yzhou <yujiao.zhou@gmail.com> | 2015-04-21 10:34:27 +0100 |
| commit | 9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8 (patch) | |
| tree | 47511c0fb89dccff0db4b5990522e04f294d795b /cpp | |
| parent | b1ac207612ee8b045244253fb94b866104bc34f2 (diff) | |
| download | ACQuA-9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8.tar.gz ACQuA-9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8.zip | |
initial version
Diffstat (limited to 'cpp')
| -rw-r--r-- | cpp/sample_cpp.cpp | 288 | ||||
| -rw-r--r-- | cpp/sample_cpp_refined.cpp | 300 |
2 files changed, 588 insertions, 0 deletions
diff --git a/cpp/sample_cpp.cpp b/cpp/sample_cpp.cpp new file mode 100644 index 0000000..769364f --- /dev/null +++ b/cpp/sample_cpp.cpp | |||
| @@ -0,0 +1,288 @@ | |||
| 1 | #include<iostream> | ||
| 2 | #include<fstream> | ||
| 3 | #include<sstream> | ||
| 4 | #include<cstdlib> | ||
| 5 | #include<map> | ||
| 6 | #include<string> | ||
| 7 | #include<list> | ||
| 8 | #include<utility> | ||
| 9 | #include<set> | ||
| 10 | #include<stack> | ||
| 11 | #include<time.h> | ||
| 12 | |||
| 13 | using namespace std; | ||
| 14 | |||
| 15 | const string rdftype = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"; | ||
| 16 | const int MAXN = 100000000; | ||
| 17 | |||
| 18 | map<string, int> inverseIndex4Individual; | ||
| 19 | map<string, int> inverseIndex4Predicate; | ||
| 20 | string *indexingIndividual; | ||
| 21 | string *indexingPredicate; | ||
| 22 | int noOfIndividuals(0), noOfPredicates(0), noOfStatements(0); | ||
| 23 | |||
| 24 | list<int> *labels; | ||
| 25 | list<pair<int,int> > *edges; | ||
| 26 | bool *visited; | ||
| 27 | |||
| 28 | int lineNumber = 0; | ||
| 29 | pair<int,int> **edges_array; | ||
| 30 | int *edges_array_size; | ||
| 31 | |||
| 32 | int getID(string s, bool isPredicate) { | ||
| 33 | int ret; | ||
| 34 | if (isPredicate) { | ||
| 35 | if (!(ret = inverseIndex4Predicate[s])) { | ||
| 36 | inverseIndex4Predicate[s] = (ret = ++noOfPredicates); | ||
| 37 | indexingPredicate[ret] = s; | ||
| 38 | cout << "new predicate: " << s << " " << ret << endl; | ||
| 39 | } | ||
| 40 | } | ||
| 41 | else { | ||
| 42 | if (!(ret = inverseIndex4Individual[s])) { | ||
| 43 | inverseIndex4Individual[s] = (ret = ++noOfIndividuals); | ||
| 44 | indexingIndividual[ret] = s; | ||
| 45 | } | ||
| 46 | } | ||
| 47 | return ret; | ||
| 48 | } | ||
| 49 | |||
| 50 | void addTriple(string s, string p, string o) { | ||
| 51 | //cout << s << " " << p << " " << o << endl; | ||
| 52 | if (!p.compare(rdftype)) { | ||
| 53 | int A(getID(o, true)), c(getID(s, false)); | ||
| 54 | labels[c].push_back(A); | ||
| 55 | } | ||
| 56 | else { | ||
| 57 | int r(getID(p, true)), a(getID(s, false)), b(getID(o, false)); | ||
| 58 | //cout << r << " " << a << " " << b << endl; | ||
| 59 | //cout << edges[a].size() << endl; | ||
| 60 | edges[a].push_back(make_pair(r, b)); | ||
| 61 | } | ||
| 62 | ++noOfStatements; | ||
| 63 | if (noOfStatements % 1000000 == 0) { | ||
| 64 | cout <<"No.of statments: " << noOfStatements | ||
| 65 | << ", No.of individuals " << noOfIndividuals | ||
| 66 | << ", No.of predicates " << noOfPredicates << endl; | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | ofstream output; | ||
| 71 | |||
| 72 | int visit(int id ) { | ||
| 73 | int num = 0; | ||
| 74 | if (!visited[id]) { | ||
| 75 | visited[id] = true; | ||
| 76 | for (list<int>::iterator it = labels[id].begin(); it != labels[id].end(); ++it, ++num) | ||
| 77 | output << indexingIndividual[id] << " " << rdftype << " " << indexingPredicate[*it] << " ." << endl; | ||
| 78 | } | ||
| 79 | return num; | ||
| 80 | } | ||
| 81 | |||
| 82 | void printList(int u, list<pair<int, int> >::iterator first, list<pair<int, int> >::iterator last) { | ||
| 83 | cout << "-------------" << endl; | ||
| 84 | for (; first != last; ++first) | ||
| 85 | cout << u << " " << first->first << " " << first->second << endl; | ||
| 86 | cout << "-------------" << endl; | ||
| 87 | } | ||
| 88 | |||
| 89 | void sample(int percentage) { | ||
| 90 | stringstream ss; | ||
| 91 | ss << "sample_" << percentage << ".nt"; | ||
| 92 | string outputName; | ||
| 93 | ss >> outputName; | ||
| 94 | |||
| 95 | int noOfRestIndividuals = noOfIndividuals; | ||
| 96 | int *restIndividualsArray = new int[noOfIndividuals]; | ||
| 97 | int *number2index = new int[noOfIndividuals + 1]; | ||
| 98 | for (int i = 0, j = 1; i < noOfIndividuals; ++i, ++j) { | ||
| 99 | restIndividualsArray[i] = j; | ||
| 100 | number2index[j] = i; | ||
| 101 | } | ||
| 102 | |||
| 103 | cout << "The sampled data is going to save in " << outputName << endl; | ||
| 104 | cout << "The number of individuals: " << noOfIndividuals << endl; | ||
| 105 | |||
| 106 | output.open(outputName.c_str(), ofstream::out); | ||
| 107 | int limit(noOfStatements * percentage / 100), choosen(0); | ||
| 108 | |||
| 109 | bool flag; | ||
| 110 | stack<int> s; | ||
| 111 | int p_from, p_to; | ||
| 112 | int u, v, individualIndex, t, pick, len; | ||
| 113 | while (true) { | ||
| 114 | if (choosen >= limit) break; | ||
| 115 | |||
| 116 | if (s.empty()) { | ||
| 117 | //cout << noOfIndividuals << " " << noOfRestIndividuals << endl; | ||
| 118 | s.push(u = restIndividualsArray[rand() % noOfRestIndividuals]); | ||
| 119 | choosen += visit(u); | ||
| 120 | //cout << "A new start " << indexing[u] << endl; | ||
| 121 | } | ||
| 122 | u = s.top(); | ||
| 123 | //cout << "The current element is :" << indexing[u] << endl; | ||
| 124 | //if (choosen % 1000000 == 0) | ||
| 125 | //cout << "No of choosen statement: " << choosen << " No of rest Individuals: " << noOfRestIndividuals << endl; | ||
| 126 | |||
| 127 | if (rand() % 100 < 15) { | ||
| 128 | //cout << "back to its father" << endl; | ||
| 129 | s.pop(); | ||
| 130 | continue; | ||
| 131 | } | ||
| 132 | |||
| 133 | //cout << "outgoing edges: " << edges_array_size[u] << endl; | ||
| 134 | if (!edges_array_size[u]) { | ||
| 135 | //cout << "empty node" << endl; | ||
| 136 | while (!s.empty()) s.pop(); | ||
| 137 | t = number2index[u]; | ||
| 138 | if (t == -1) continue; | ||
| 139 | --noOfRestIndividuals; | ||
| 140 | //cout << indexing[u] << " has been removed with current index " << index << " and number " << u << endl; | ||
| 141 | v = restIndividualsArray[noOfRestIndividuals]; | ||
| 142 | restIndividualsArray[t] = v; | ||
| 143 | number2index[v] = t; | ||
| 144 | number2index[u] = -1; | ||
| 145 | continue; | ||
| 146 | } | ||
| 147 | |||
| 148 | pick = rand() % edges_array_size[u]; | ||
| 149 | if (pick < 0) pick += edges_array_size[u]; | ||
| 150 | p_from = edges_array[u][pick].first; | ||
| 151 | p_to = edges_array[u][pick].second; | ||
| 152 | len = --edges_array_size[u]; | ||
| 153 | edges_array[u][pick].first = edges_array[u][len].first; | ||
| 154 | edges_array[u][pick].second = edges_array[u][len].second; | ||
| 155 | s.push(p_to); | ||
| 156 | choosen += visit(p_to) + 1; | ||
| 157 | if (!indexingPredicate[p_from].size()) { | ||
| 158 | cout << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl; | ||
| 159 | return ; | ||
| 160 | } | ||
| 161 | output << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl; | ||
| 162 | } | ||
| 163 | |||
| 164 | delete[] restIndividualsArray; | ||
| 165 | delete[] number2index; | ||
| 166 | cout << "The number of statements: " << noOfStatements << " (choosen: " << choosen << ")" << endl; | ||
| 167 | cout << "The number of rest Individuals: " << noOfRestIndividuals << endl; | ||
| 168 | output.close(); | ||
| 169 | } | ||
| 170 | |||
| 171 | int main(int argc, char **argv) { | ||
| 172 | if (argc < 3) return 0; | ||
| 173 | cout << "Trying to open the file " << argv[1] << endl; | ||
| 174 | cout << "Get " << argv[2] << "/100 of the file" << endl; | ||
| 175 | |||
| 176 | int startLine(-1); | ||
| 177 | istringstream iss; | ||
| 178 | if (argc > 3) { | ||
| 179 | iss.str(argv[3]); | ||
| 180 | iss >> startLine; | ||
| 181 | iss.clear(); | ||
| 182 | } | ||
| 183 | cout << "Problematic line starts at " << startLine << endl; | ||
| 184 | |||
| 185 | ifstream input; | ||
| 186 | input.open(argv[1]); | ||
| 187 | if (!input) { | ||
| 188 | cout << "No such file" << endl; | ||
| 189 | } | ||
| 190 | cout << "The input file is " << argv[1] << endl; | ||
| 191 | |||
| 192 | int number, index(0); | ||
| 193 | string subject, predicate, object; | ||
| 194 | |||
| 195 | clock_t start = clock(); | ||
| 196 | cout << "Starting constructing the graph..." << endl; | ||
| 197 | indexingIndividual = new string[MAXN]; | ||
| 198 | indexingPredicate = new string[MAXN]; | ||
| 199 | labels = new list<int>[MAXN]; | ||
| 200 | edges = new list<pair<int, int> >[MAXN]; | ||
| 201 | cout << "mem alloc time: " << ((float) clock() - (float) start) / CLOCKS_PER_SEC << " sec" << endl; | ||
| 202 | |||
| 203 | int noOfInvalidLines = 6; | ||
| 204 | int invalidLines[] = { 700761, 703992, 750181, 750183, 750185, 750315}; | ||
| 205 | int point(startLine == -1 ? noOfInvalidLines : 0); | ||
| 206 | for (int i = point; i < noOfInvalidLines; ++i) | ||
| 207 | // 288884361 | ||
| 208 | invalidLines[i] = startLine + (invalidLines[i] - 700761) - 1; | ||
| 209 | |||
| 210 | string line, nextLine; | ||
| 211 | while (!input.eof()) { | ||
| 212 | if (input >> subject >> predicate); else break; | ||
| 213 | getline(input, object); | ||
| 214 | ++lineNumber; | ||
| 215 | //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl; | ||
| 216 | |||
| 217 | object.erase(0, 1); | ||
| 218 | if (point < noOfInvalidLines && lineNumber == invalidLines[point]) { | ||
| 219 | getline(input, nextLine); | ||
| 220 | ++lineNumber; | ||
| 221 | ++point; | ||
| 222 | //cout << lineNumber << endl << object << endl << " " << nextLine << endl; | ||
| 223 | object.replace(object.size() - 1, 1, nextLine); | ||
| 224 | //cout << object << endl; | ||
| 225 | } | ||
| 226 | |||
| 227 | object.erase(object.size() - 2, 2); | ||
| 228 | |||
| 229 | //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl; | ||
| 230 | addTriple(subject, predicate, object); | ||
| 231 | //cout << "Triple added!" << endl; | ||
| 232 | } | ||
| 233 | cout << lineNumber << " " << subject << " " << predicate << " " << object << endl; | ||
| 234 | |||
| 235 | input.close(); | ||
| 236 | |||
| 237 | cout << "Graph constructed!" << endl; | ||
| 238 | float constructionTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60); | ||
| 239 | cout << "Construction time: " << constructionTime << " min" << endl; | ||
| 240 | //return 0; | ||
| 241 | |||
| 242 | edges_array = new pair<int,int>* [noOfIndividuals + 1]; | ||
| 243 | edges_array_size = new int[noOfIndividuals + 1]; | ||
| 244 | for (int i = 1, j = 0; i <= noOfIndividuals; ++i) { | ||
| 245 | edges_array_size[i] = edges[i].size(); | ||
| 246 | edges_array[i] = new pair<int,int>[edges_array_size[i]]; | ||
| 247 | j = 0; | ||
| 248 | for (list<pair<int,int> >::iterator it = edges[i].begin(); it != edges[i].end(); ++it, ++j) { | ||
| 249 | edges_array[i][j] = *it; | ||
| 250 | } | ||
| 251 | } | ||
| 252 | |||
| 253 | delete[] edges; | ||
| 254 | constructionTime = ((float) clock() - (float) start) / CLOCKS_PER_SEC / 60; | ||
| 255 | cout << "Rearrange done: " << constructionTime << " min" << endl; | ||
| 256 | |||
| 257 | visited = new bool[noOfIndividuals + 1]; | ||
| 258 | for (int i = 1; i < noOfIndividuals + 1; ++i) | ||
| 259 | visited[i] = 0; | ||
| 260 | |||
| 261 | int percentage; | ||
| 262 | |||
| 263 | srand(19900114); | ||
| 264 | if (argc > 2) { | ||
| 265 | iss.str(argv[2]); | ||
| 266 | iss >> percentage; | ||
| 267 | iss.clear(); | ||
| 268 | } | ||
| 269 | else percentage = 1; | ||
| 270 | |||
| 271 | if (percentage == 0) return 0; | ||
| 272 | sample(percentage); | ||
| 273 | |||
| 274 | cout << "Success!" << endl; | ||
| 275 | float totalTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60); | ||
| 276 | cout << "Total time: " << totalTime << " (Sample time: " << totalTime - constructionTime << ") min" << endl; | ||
| 277 | |||
| 278 | delete[] indexingIndividual; | ||
| 279 | delete[] visited; | ||
| 280 | delete[] labels; | ||
| 281 | |||
| 282 | for (int i = 1; i <= noOfIndividuals; ++i) | ||
| 283 | delete[] edges_array[i]; | ||
| 284 | |||
| 285 | delete[] edges_array; | ||
| 286 | |||
| 287 | return 0; | ||
| 288 | } | ||
diff --git a/cpp/sample_cpp_refined.cpp b/cpp/sample_cpp_refined.cpp new file mode 100644 index 0000000..f04ed7d --- /dev/null +++ b/cpp/sample_cpp_refined.cpp | |||
| @@ -0,0 +1,300 @@ | |||
| 1 | #include<iostream> | ||
| 2 | #include<fstream> | ||
| 3 | #include<sstream> | ||
| 4 | #include<cstdlib> | ||
| 5 | #include<map> | ||
| 6 | #include<string> | ||
| 7 | #include<list> | ||
| 8 | #include<utility> | ||
| 9 | #include<set> | ||
| 10 | #include<stack> | ||
| 11 | #include<time.h> | ||
| 12 | |||
| 13 | using namespace std; | ||
| 14 | |||
| 15 | const string rdftype = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"; | ||
| 16 | const int MAXN = 100000000; | ||
| 17 | |||
| 18 | map<string, int> inverseIndex4Individual; | ||
| 19 | map<string, int> inverseIndex4Predicate; | ||
| 20 | string *indexingIndividual; | ||
| 21 | string *indexingPredicate; | ||
| 22 | int noOfIndividuals(0), noOfPredicates(0), noOfStatements(0); | ||
| 23 | |||
| 24 | list<int> *labels; | ||
| 25 | list<pair<int,int> > *edges; | ||
| 26 | bool *visited; | ||
| 27 | |||
| 28 | int lineNumber = 0; | ||
| 29 | pair<int,int> **edges_array; | ||
| 30 | int *edges_array_size; | ||
| 31 | string outputName; | ||
| 32 | |||
| 33 | int getID(string s, bool isPredicate) { | ||
| 34 | int ret; | ||
| 35 | if (isPredicate) { | ||
| 36 | if (!(ret = inverseIndex4Predicate[s])) { | ||
| 37 | inverseIndex4Predicate[s] = (ret = ++noOfPredicates); | ||
| 38 | indexingPredicate[ret] = s; | ||
| 39 | cout << "new predicate: " << s << " " << ret << endl; | ||
| 40 | } | ||
| 41 | } | ||
| 42 | else { | ||
| 43 | if (!(ret = inverseIndex4Individual[s])) { | ||
| 44 | inverseIndex4Individual[s] = (ret = ++noOfIndividuals); | ||
| 45 | indexingIndividual[ret] = s; | ||
| 46 | } | ||
| 47 | } | ||
| 48 | return ret; | ||
| 49 | } | ||
| 50 | |||
| 51 | void addTriple(string s, string p, string o) { | ||
| 52 | //cout << s << " " << p << " " << o << endl; | ||
| 53 | if (!p.compare(rdftype)) { | ||
| 54 | int A(getID(o, true)), c(getID(s, false)); | ||
| 55 | labels[c].push_back(A); | ||
| 56 | } | ||
| 57 | else { | ||
| 58 | int r(getID(p, true)), a(getID(s, false)), b(getID(o, false)); | ||
| 59 | //cout << r << " " << a << " " << b << endl; | ||
| 60 | //cout << edges[a].size() << endl; | ||
| 61 | edges[a].push_back(make_pair(r, b)); | ||
| 62 | } | ||
| 63 | ++noOfStatements; | ||
| 64 | if (noOfStatements % 1000000 == 0) { | ||
| 65 | cout <<"No.of statments: " << noOfStatements | ||
| 66 | << ", No.of individuals " << noOfIndividuals | ||
| 67 | << ", No.of predicates " << noOfPredicates << endl; | ||
| 68 | } | ||
| 69 | } | ||
| 70 | |||
| 71 | ofstream output; | ||
| 72 | |||
| 73 | int visit(int id ) { | ||
| 74 | int num = 0; | ||
| 75 | if (!visited[id]) { | ||
| 76 | visited[id] = true; | ||
| 77 | for (list<int>::iterator it = labels[id].begin(); it != labels[id].end(); ++it, ++num) | ||
| 78 | output << indexingIndividual[id] << " " << rdftype << " " << indexingPredicate[*it] << " ." << endl; | ||
| 79 | } | ||
| 80 | return num; | ||
| 81 | } | ||
| 82 | |||
| 83 | void printList(int u, list<pair<int, int> >::iterator first, list<pair<int, int> >::iterator last) { | ||
| 84 | cout << "-------------" << endl; | ||
| 85 | for (; first != last; ++first) | ||
| 86 | cout << u << " " << first->first << " " << first->second << endl; | ||
| 87 | cout << "-------------" << endl; | ||
| 88 | } | ||
| 89 | |||
| 90 | void sample(int percentage) { | ||
| 91 | int noOfRestIndividuals = noOfIndividuals; | ||
| 92 | int *restIndividualsArray = new int[noOfIndividuals]; | ||
| 93 | int *number2index = new int[noOfIndividuals + 1]; | ||
| 94 | for (int i = 0, j = 1; i < noOfIndividuals; ++i, ++j) { | ||
| 95 | restIndividualsArray[i] = j; | ||
| 96 | number2index[j] = i; | ||
| 97 | } | ||
| 98 | |||
| 99 | cout << "The sampled data is going to save in " << outputName << endl; | ||
| 100 | cout << "The number of individuals: " << noOfIndividuals << endl; | ||
| 101 | |||
| 102 | output.open(outputName.c_str(), ofstream::out); | ||
| 103 | int limit(noOfStatements / 100 * percentage), choosen(0); | ||
| 104 | //cout << noOfStatements * percentage << endl; | ||
| 105 | cout << "Percentage " << percentage << " Limit " << limit << endl; | ||
| 106 | |||
| 107 | bool flag; | ||
| 108 | stack<int> s; | ||
| 109 | int p_from, p_to; | ||
| 110 | int u, v, individualIndex, t, pick, len; | ||
| 111 | while (true) { | ||
| 112 | if (choosen >= limit) break; | ||
| 113 | |||
| 114 | if (s.empty()) { | ||
| 115 | //cout << noOfIndividuals << " " << noOfRestIndividuals << endl; | ||
| 116 | s.push(u = restIndividualsArray[rand() % noOfRestIndividuals]); | ||
| 117 | choosen += visit(u); | ||
| 118 | //cout << "A new start " << indexing[u] << endl; | ||
| 119 | } | ||
| 120 | u = s.top(); | ||
| 121 | //cout << "The current element is :" << indexing[u] << endl; | ||
| 122 | //if (choosen % 1000000 == 0) | ||
| 123 | //cout << "No of choosen statement: " << choosen << " No of rest Individuals: " << noOfRestIndividuals << endl; | ||
| 124 | |||
| 125 | if (rand() % 100 < 15) { | ||
| 126 | //cout << "back to its father" << endl; | ||
| 127 | s.pop(); | ||
| 128 | continue; | ||
| 129 | } | ||
| 130 | |||
| 131 | //cout << "outgoing edges: " << edges_array_size[u] << endl; | ||
| 132 | if (!edges_array_size[u]) { | ||
| 133 | //cout << "empty node" << endl; | ||
| 134 | while (!s.empty()) s.pop(); | ||
| 135 | t = number2index[u]; | ||
| 136 | if (t == -1) continue; | ||
| 137 | --noOfRestIndividuals; | ||
| 138 | //cout << indexing[u] << " has been removed with current index " << index << " and number " << u << endl; | ||
| 139 | v = restIndividualsArray[noOfRestIndividuals]; | ||
| 140 | restIndividualsArray[t] = v; | ||
| 141 | number2index[v] = t; | ||
| 142 | number2index[u] = -1; | ||
| 143 | continue; | ||
| 144 | } | ||
| 145 | |||
| 146 | pick = rand() % edges_array_size[u]; | ||
| 147 | if (pick < 0) pick += edges_array_size[u]; | ||
| 148 | p_from = edges_array[u][pick].first; | ||
| 149 | p_to = edges_array[u][pick].second; | ||
| 150 | len = --edges_array_size[u]; | ||
| 151 | edges_array[u][pick].first = edges_array[u][len].first; | ||
| 152 | edges_array[u][pick].second = edges_array[u][len].second; | ||
| 153 | s.push(p_to); | ||
| 154 | choosen += visit(p_to) + 1; | ||
| 155 | if (!indexingPredicate[p_from].size()) { | ||
| 156 | cout << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl; | ||
| 157 | return ; | ||
| 158 | } | ||
| 159 | output << indexingIndividual[u] << " " << indexingPredicate[p_from] << " " << indexingIndividual[p_to] << " ." << endl; | ||
| 160 | } | ||
| 161 | |||
| 162 | delete[] restIndividualsArray; | ||
| 163 | delete[] number2index; | ||
| 164 | cout << "The number of statements: " << noOfStatements << " (choosen: " << choosen << ")" << endl; | ||
| 165 | cout << "The number of rest Individuals: " << noOfRestIndividuals << endl; | ||
| 166 | output.close(); | ||
| 167 | } | ||
| 168 | |||
| 169 | int main(int argc, char **argv) { | ||
| 170 | if (argc < 4) { | ||
| 171 | cout << "./sample originalDataFile percentage outputFile" << endl; | ||
| 172 | return 0; | ||
| 173 | } | ||
| 174 | |||
| 175 | if (argc < 3) return 0; | ||
| 176 | cout << "Trying to open the file " << argv[1] << endl; | ||
| 177 | cout << "Get " << argv[2] << "/100 of the file" << endl; | ||
| 178 | |||
| 179 | istringstream iss; | ||
| 180 | iss.str(argv[3]); | ||
| 181 | iss >> outputName; | ||
| 182 | iss.clear(); | ||
| 183 | |||
| 184 | //stringstream ss; | ||
| 185 | //ss << "/media/krr-nas-share/Yujiao/ontologies/bio2rdf/chembl/graph sampling/sample_" << percentage << ".nt"; | ||
| 186 | //string outputName; | ||
| 187 | //ss >> outputName; | ||
| 188 | |||
| 189 | int startLine(-1); | ||
| 190 | if (argc > 4) { | ||
| 191 | iss.str(argv[4]); | ||
| 192 | iss >> startLine; | ||
| 193 | iss.clear(); | ||
| 194 | } | ||
| 195 | cout << "Problematic line starts at " << startLine << endl; | ||
| 196 | |||
| 197 | ifstream input; | ||
| 198 | input.open(argv[1]); | ||
| 199 | if (!input) { | ||
| 200 | cout << "No such file" << endl; | ||
| 201 | } | ||
| 202 | cout << "The input file is " << argv[1] << endl; | ||
| 203 | |||
| 204 | int number, index(0); | ||
| 205 | string subject, predicate, object; | ||
| 206 | |||
| 207 | clock_t start = clock(); | ||
| 208 | cout << "Starting constructing the graph..." << endl; | ||
| 209 | indexingIndividual = new string[MAXN]; | ||
| 210 | indexingPredicate = new string[MAXN]; | ||
| 211 | labels = new list<int>[MAXN]; | ||
| 212 | edges = new list<pair<int, int> >[MAXN]; | ||
| 213 | cout << "mem alloc time: " << ((float) clock() - (float) start) / CLOCKS_PER_SEC << " sec" << endl; | ||
| 214 | |||
| 215 | int noOfInvalidLines = 6; | ||
| 216 | int invalidLines[] = { 700761, 703992, 750181, 750183, 750185, 750315}; | ||
| 217 | int point(startLine == -1 ? noOfInvalidLines : 0); | ||
| 218 | for (int i = point; i < noOfInvalidLines; ++i) | ||
| 219 | // 288884361 | ||
| 220 | invalidLines[i] = startLine + (invalidLines[i] - 700761) - 1; | ||
| 221 | |||
| 222 | string line, nextLine; | ||
| 223 | while (!input.eof()) { | ||
| 224 | if (input >> subject >> predicate); else break; | ||
| 225 | getline(input, object); | ||
| 226 | ++lineNumber; | ||
| 227 | //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl; | ||
| 228 | |||
| 229 | object.erase(0, 1); | ||
| 230 | if (point < noOfInvalidLines && lineNumber == invalidLines[point]) { | ||
| 231 | getline(input, nextLine); | ||
| 232 | ++lineNumber; | ||
| 233 | ++point; | ||
| 234 | //cout << lineNumber << endl << object << endl << " " << nextLine << endl; | ||
| 235 | object.replace(object.size() - 1, 1, nextLine); | ||
| 236 | //cout << object << endl; | ||
| 237 | } | ||
| 238 | |||
| 239 | object.erase(object.size() - 2, 2); | ||
| 240 | |||
| 241 | //cout << lineNumber << " " << subject << " " << predicate << " \"" << object << "\"" << endl; | ||
| 242 | addTriple(subject, predicate, object); | ||
| 243 | //cout << "Triple added!" << endl; | ||
| 244 | } | ||
| 245 | cout << lineNumber << " " << subject << " " << predicate << " " << object << endl; | ||
| 246 | |||
| 247 | input.close(); | ||
| 248 | |||
| 249 | cout << "Graph constructed!" << endl; | ||
| 250 | float constructionTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60); | ||
| 251 | cout << "Construction time: " << constructionTime << " min" << endl; | ||
| 252 | //return 0; | ||
| 253 | |||
| 254 | edges_array = new pair<int,int>* [noOfIndividuals + 1]; | ||
| 255 | edges_array_size = new int[noOfIndividuals + 1]; | ||
| 256 | for (int i = 1, j = 0; i <= noOfIndividuals; ++i) { | ||
| 257 | edges_array_size[i] = edges[i].size(); | ||
| 258 | edges_array[i] = new pair<int,int>[edges_array_size[i]]; | ||
| 259 | j = 0; | ||
| 260 | for (list<pair<int,int> >::iterator it = edges[i].begin(); it != edges[i].end(); ++it, ++j) { | ||
| 261 | edges_array[i][j] = *it; | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | delete[] edges; | ||
| 266 | constructionTime = ((float) clock() - (float) start) / CLOCKS_PER_SEC / 60; | ||
| 267 | cout << "Rearrange done: " << constructionTime << " min" << endl; | ||
| 268 | |||
| 269 | visited = new bool[noOfIndividuals + 1]; | ||
| 270 | for (int i = 1; i < noOfIndividuals + 1; ++i) | ||
| 271 | visited[i] = 0; | ||
| 272 | |||
| 273 | int percentage; | ||
| 274 | |||
| 275 | srand(19900114); | ||
| 276 | if (argc > 2) { | ||
| 277 | iss.str(argv[2]); | ||
| 278 | iss >> percentage; | ||
| 279 | iss.clear(); | ||
| 280 | } | ||
| 281 | else percentage = 1; | ||
| 282 | |||
| 283 | if (percentage == 0) return 0; | ||
| 284 | sample(percentage); | ||
| 285 | |||
| 286 | cout << "Success!" << endl; | ||
| 287 | float totalTime(((float) clock() - (float) start) / CLOCKS_PER_SEC / 60); | ||
| 288 | cout << "Total time: " << totalTime << " (Sample time: " << totalTime - constructionTime << ") min" << endl; | ||
| 289 | |||
| 290 | delete[] indexingIndividual; | ||
| 291 | delete[] visited; | ||
| 292 | delete[] labels; | ||
| 293 | |||
| 294 | for (int i = 1; i <= noOfIndividuals; ++i) | ||
| 295 | delete[] edges_array[i]; | ||
| 296 | |||
| 297 | delete[] edges_array; | ||
| 298 | |||
| 299 | return 0; | ||
| 300 | } | ||
