1 files changed, 0 insertions, 155 deletions
diff --git a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java
deleted file mode 100644
index e025604..0000000
--- a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java
+++ /dev/null
@@ -1,155 +0,0 @@
-package uk.ac.ox.cs.data.dbpedia;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.text.Normalizer;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Pattern;
-public class Normaliser {
-        
-        public static void main(String[] args) throws IOException {
-                if (args.length == 0) {
-                        args = new String[] { 
-                                        "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl", 
-                                        "1"
-                        }; 
-                }
-                
-                BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));
-                String fragment = args[0];
-                int size = Integer.valueOf(args[1]), index;
-                
-                if ((index = fragment.lastIndexOf(".")) != -1) {
-                        fragment = fragment.substring(0, index) + "_new_fragment" + args[1] + fragment.substring(index); 
-                }
-                else fragment += "_fragment" + args[1]; 
-                
-                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment)));
-                
-//              simpleProcess(reader, writer, size);
-                process(reader, writer, size);
-                
-                writer.close(); 
-                reader.close(); 
-        }
-        
-        public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
-                String line; 
-                int index = 0; 
-                while ((line = reader.readLine()) != null) {
-                        if (++index == size) {
-                                index = 0; 
-                                writer.write(line);
-                                writer.newLine(); 
-                        }
-                }
-        }
-        
-        static final String illegalSymbols = ",()'‘";
-        static final String[][] replacedSymbols = new String[][] {
-                {"æ", "ae"}, 
-                {"ø", "o"}, 
-                {"ß", "t"},  
-                {"Ł", "L"}, 
-                {"ı", "i"}, 
-                {"ł", "l"}, 
-                {"–", "-"}, 
-                {"&", "and"}, 
-                {"ð", "o"}, 
-                {"ə", "e"}, 
-                {"Đ", "D"}, 
-                {"ħ", "h"}, 
-//              {"%60", "_"},
-                {"đ", "d"},  
-                {"Þ", "P"}
-        };
-        
-        static Set<Character> symbols2remove;
-        static Map<Character, String> symbols2replace; 
-        
-        static {
-                symbols2remove = new HashSet<Character>();
-                for (int i = 0; i < illegalSymbols.length(); ++i)
-                        symbols2remove.add(illegalSymbols.charAt(i));
-                
-                symbols2replace = new HashMap<Character, String>();
-                for (int i = 0; i < replacedSymbols.length; ++i)
-                        symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]); 
-        }
-        
-        static final String urlSymbols = "http://"; 
-        static final int urlSymbolLength = 7;
-        
-        public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
-                int index = 0;
-                String line; 
-                
-                String newLine; 
-                while ((line = reader.readLine()) != null) {
-                        if (line.contains("@")) 
-                                continue;
-                        if (++index == size) {
-                                newLine = process(line); 
-                                writer.write(deAccent(newLine.toString()));
-                                writer.write('.');
-                                writer.newLine(); 
-                                index = 0; 
-                        }
-                }
-                
-                writer.close(); 
-                reader.close(); 
-        }
-        private static String process(String line) {
-                line = line.replace("%60", "_");//.replace("__", "_");
-                
-                int inURL = 0;
-                char ch; 
-                String str; 
-                StringBuilder newLine = new StringBuilder(); 
-                for (int i = 0; i < line.length(); ++i) {
-                        ch = line.charAt(i); 
-                        
-                        if (ch == '.') {
-                                if (inURL == urlSymbolLength) 
-                                        newLine.append('.'); 
-                                continue; 
-                        }
-                
-                        if (inURL == urlSymbolLength) {
-                                if (ch == '/' || ch == '#' || ch == ')' || ch == '>') inURL = 0; 
-                        }
-                        else if (ch == urlSymbols.charAt(inURL)) {
-                                ++inURL;
-                        }
-                        else inURL = 0; 
-                        
-                        if ((str = symbols2replace.get(ch)) != null)
-                                newLine.append(str); 
-                        else if (!symbols2remove.contains(ch))
-                                newLine.append(ch); 
-                }
-                
-                return newLine.toString();
-        }
-        public static String deAccent(String str) {
-            String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD); 
-            Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
-            String t = pattern.matcher(nfdNormalizedString).replaceAll("");
-            return t; 
-        }
-        
-}

diff --git a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java deleted file mode 100644 index e025604..0000000 --- a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java +++ /dev/null
@@ -1,155 +0,0 @@
1	package uk.ac.ox.cs.data.dbpedia;
2
3	import java.io.BufferedReader;
4	import java.io.BufferedWriter;
5	import java.io.FileInputStream;
6	import java.io.FileOutputStream;
7	import java.io.IOException;
8	import java.io.InputStreamReader;
9	import java.io.OutputStreamWriter;
10	import java.text.Normalizer;
11	import java.util.HashMap;
12	import java.util.HashSet;
13	import java.util.Map;
14	import java.util.Set;
15	import java.util.regex.Pattern;
16
17	public class Normaliser {
18
19	public static void main(String[] args) throws IOException {
20	if (args.length == 0) {
21	args = new String[] {
22	"/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl",
23	"1"
24	};
25	}
26
27	BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));
28	String fragment = args[0];
29	int size = Integer.valueOf(args[1]), index;
30
31	if ((index = fragment.lastIndexOf(".")) != -1) {
32	fragment = fragment.substring(0, index) + "_new_fragment" + args[1] + fragment.substring(index);
33	}
34	else fragment += "_fragment" + args[1];
35
36	BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment)));
37
38	// simpleProcess(reader, writer, size);
39	process(reader, writer, size);
40
41	writer.close();
42	reader.close();
43	}
44
45	public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
46	String line;
47	int index = 0;
48	while ((line = reader.readLine()) != null) {
49	if (++index == size) {
50	index = 0;
51	writer.write(line);
52	writer.newLine();
53	}
54	}
55	}
56
57	static final String illegalSymbols = ",()'‘";
58	static final String[][] replacedSymbols = new String[][] {
59	{"æ", "ae"},
60	{"ø", "o"},
61	{"ß", "t"},
62	{"Ł", "L"},
63	{"ı", "i"},
64	{"ł", "l"},
65	{"–", "-"},
66	{"&", "and"},
67	{"ð", "o"},
68	{"ə", "e"},
69	{"Đ", "D"},
70	{"ħ", "h"},
71	// {"%60", "_"},
72	{"đ", "d"},
73	{"Þ", "P"}
74	};
75
76	static Set<Character> symbols2remove;
77	static Map<Character, String> symbols2replace;
78
79	static {
80	symbols2remove = new HashSet<Character>();
81	for (int i = 0; i < illegalSymbols.length(); ++i)
82	symbols2remove.add(illegalSymbols.charAt(i));
83
84	symbols2replace = new HashMap<Character, String>();
85	for (int i = 0; i < replacedSymbols.length; ++i)
86	symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]);
87	}
88
89	static final String urlSymbols = "http://";
90	static final int urlSymbolLength = 7;
91
92	public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
93	int index = 0;
94	String line;
95
96	String newLine;
97	while ((line = reader.readLine()) != null) {
98	if (line.contains("@"))
99	continue;
100
101	if (++index == size) {
102	newLine = process(line);
103	writer.write(deAccent(newLine.toString()));
104	writer.write('.');
105	writer.newLine();
106	index = 0;
107	}
108	}
109
110	writer.close();
111	reader.close();
112	}
113
114	private static String process(String line) {
115	line = line.replace("%60", "_");//.replace("__", "_");
116
117	int inURL = 0;
118	char ch;
119	String str;
120	StringBuilder newLine = new StringBuilder();
121	for (int i = 0; i < line.length(); ++i) {
122	ch = line.charAt(i);
123
124	if (ch == '.') {
125	if (inURL == urlSymbolLength)
126	newLine.append('.');
127	continue;
128	}
129
130	if (inURL == urlSymbolLength) {
131	if (ch == '/' \|\| ch == '#' \|\| ch == ')' \|\| ch == '>') inURL = 0;
132	}
133	else if (ch == urlSymbols.charAt(inURL)) {
134	++inURL;
135	}
136	else inURL = 0;
137
138	if ((str = symbols2replace.get(ch)) != null)
139	newLine.append(str);
140	else if (!symbols2remove.contains(ch))
141	newLine.append(ch);
142	}
143
144	return newLine.toString();
145	}
146
147	public static String deAccent(String str) {
148	String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
149	Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
150	String t = pattern.matcher(nfdNormalizedString).replaceAll("");
151	return t;
152	}
153
154
155	}