aboutsummaryrefslogtreecommitdiff
path: root/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java
diff options
context:
space:
mode:
Diffstat (limited to 'test/uk/ac/ox/cs/data/dbpedia/Normaliser.java')
-rw-r--r--test/uk/ac/ox/cs/data/dbpedia/Normaliser.java155
1 files changed, 0 insertions, 155 deletions
diff --git a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java
deleted file mode 100644
index e025604..0000000
--- a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java
+++ /dev/null
@@ -1,155 +0,0 @@
1package uk.ac.ox.cs.data.dbpedia;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.FileInputStream;
6import java.io.FileOutputStream;
7import java.io.IOException;
8import java.io.InputStreamReader;
9import java.io.OutputStreamWriter;
10import java.text.Normalizer;
11import java.util.HashMap;
12import java.util.HashSet;
13import java.util.Map;
14import java.util.Set;
15import java.util.regex.Pattern;
16
/**
 * Command-line utility that samples every {@code size}-th line of a text file,
 * removes or replaces a fixed set of symbols, strips combining diacritical
 * marks, and writes the result to a new file next to the input.
 *
 * <p>Usage: {@code Normaliser <input file> <sampling size>}. When started
 * without arguments a built-in default input path and a size of 1 are used.
 */
public class Normaliser {

    /** Charset used for both the input and the output file. */
    private static final String CHARSET = "UTF-8";

    /** Matches runs of combining diacritical marks; compiled once and reused by {@link #deAccent}. */
    private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Entry point: derives the output file name from the input file name,
     * then runs {@link #process(BufferedReader, BufferedWriter, int)}.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the sampling size
     * @throws IOException              if the input cannot be read or the output cannot be written
     * @throws IllegalArgumentException if exactly one argument is supplied
     * @throws NumberFormatException    if {@code args[1]} is not an integer
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            args = new String[] {
                "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl",
                "1"
            };
        }
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: Normaliser <input file> <sampling size>");
        }

        // Parse before opening any file so a malformed size cannot leak an open stream.
        int size = Integer.parseInt(args[1]);
        String fragment = fragmentPath(args[0], args[1]);

        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(args[0]), CHARSET));
        try {
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(fragment), CHARSET));
            try {
                // simpleProcess(reader, writer, size) is the alternative that samples without normalising.
                process(reader, writer, size);
            } finally {
                // process() closes the streams on success; closing again is harmless
                // and guarantees release when it fails part-way.
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Builds the output path by inserting a marker and the size argument
     * before the file extension, or appending them when there is none.
     */
    private static String fragmentPath(String inputPath, String sizeArg) {
        // Only a dot after the last path separator belongs to the file name;
        // a dot inside a directory name must not be treated as the extension.
        int lastSeparator = Math.max(inputPath.lastIndexOf('/'), inputPath.lastIndexOf('\\'));
        int dot = inputPath.lastIndexOf('.');
        if (dot > lastSeparator) {
            return inputPath.substring(0, dot) + "_new_fragment" + sizeArg + inputPath.substring(dot);
        }
        // NOTE(review): this branch uses "_fragment" rather than "_new_fragment";
        // kept as-is so existing output names do not change.
        return inputPath + "_fragment" + sizeArg;
    }

    /**
     * Copies every {@code size}-th line from {@code reader} to {@code writer}
     * unchanged. Neither stream is flushed or closed by this method.
     *
     * @param reader source of lines
     * @param writer destination for the sampled lines
     * @param size   sampling interval; a value below 1 selects no line
     * @throws IOException if reading or writing fails
     */
    public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        String line;
        int index = 0;
        while ((line = reader.readLine()) != null) {
            if (++index == size) {
                index = 0;
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /** Characters that are dropped from every processed line. */
    static final String illegalSymbols = ",()'‘";

    /**
     * Single-character replacements applied to every processed line
     * (first element: character to replace, second element: replacement).
     * The sequence "%60" is handled separately in {@link #process(String)}.
     */
    static final String[][] replacedSymbols = new String[][] {
        {"æ", "ae"},
        {"ø", "o"},
        // NOTE(review): "ß" mapped to "t" looks unusual ("ss" is the common
        // transliteration) - confirm against the data this must match before changing.
        {"ß", "t"},
        {"Ł", "L"},
        {"ı", "i"},
        {"ł", "l"},
        {"–", "-"},
        {"&", "and"},
        {"ð", "o"},
        {"ə", "e"},
        {"Đ", "D"},
        {"ħ", "h"},
        {"đ", "d"},
        {"Þ", "P"}
    };

    /** Lookup set built from {@link #illegalSymbols}. */
    static Set<Character> symbols2remove;

    /** Lookup table built from {@link #replacedSymbols}. */
    static Map<Character, String> symbols2replace;

    static {
        symbols2remove = new HashSet<Character>();
        for (int i = 0; i < illegalSymbols.length(); ++i)
            symbols2remove.add(illegalSymbols.charAt(i));

        symbols2replace = new HashMap<Character, String>();
        for (int i = 0; i < replacedSymbols.length; ++i)
            symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]);
    }

    /** Prefix after which dots are preserved, see {@link #process(String)}. */
    static final String urlSymbols = "http://";

    /** Length of {@link #urlSymbols}. */
    static final int urlSymbolLength = urlSymbols.length();

    /**
     * Writes a normalised copy of every {@code size}-th line of {@code reader}
     * to {@code writer}. Lines containing {@code '@'} are skipped and do not
     * count towards the sampling interval. Each written line is followed by
     * {@code '.'} and a line separator. Both streams are closed once all
     * input has been consumed.
     *
     * @param reader source of lines
     * @param writer destination for the normalised lines
     * @param size   sampling interval; a value below 1 selects no line
     * @throws IOException if reading or writing fails
     */
    public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        int index = 0;
        String line;

        while ((line = reader.readLine()) != null) {
            if (line.contains("@"))
                continue;

            if (++index == size) {
                writer.write(deAccent(process(line)));
                // process(String) drops every dot outside the "http://..." prefix,
                // including a trailing one, so a terminating dot is written here.
                writer.write('.');
                writer.newLine();
                index = 0;
            }
        }

        writer.close();
        reader.close();
    }

    /**
     * Normalises a single line: replaces "%60" with "_", applies
     * {@link #symbols2replace}, drops {@link #symbols2remove}, and drops
     * every '.' except those that follow "http://" before the next
     * '/', '#', ')' or '&gt;'.
     */
    private static String process(String line) {
        line = line.replace("%60", "_");

        // Number of leading characters of urlSymbols matched so far;
        // urlSymbolLength means the scan is right after a complete "http://".
        int inURL = 0;
        StringBuilder newLine = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i);

            if (ch == '.') {
                if (inURL == urlSymbolLength)
                    newLine.append('.');
                // A dot never changes the match state, whether it is kept or dropped.
                continue;
            }

            if (inURL == urlSymbolLength) {
                if (ch == '/' || ch == '#' || ch == ')' || ch == '>') inURL = 0;
            }
            else if (ch == urlSymbols.charAt(inURL)) {
                ++inURL;
            }
            else {
                // On a mismatch the current character may itself start a new
                // "http://" (e.g. "hhttp://"), so restart the match at it
                // instead of discarding it.
                inURL = (ch == urlSymbols.charAt(0)) ? 1 : 0;
            }

            String replacement = symbols2replace.get(ch);
            if (replacement != null)
                newLine.append(replacement);
            else if (!symbols2remove.contains(ch))
                newLine.append(ch);
        }

        return newLine.toString();
    }

    /**
     * Removes accents by decomposing {@code str} to Unicode NFD form and
     * deleting all combining diacritical marks.
     *
     * @param str text to strip; must not be {@code null}
     * @return the NFD-normalised text without combining diacritical marks
     */
    public static String deAccent(String str) {
        String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
        return DIACRITICS.matcher(nfdNormalizedString).replaceAll("");
    }

}