diff options
| author | yzhou <yzhou@krr-linux.cs.ox.ac.uk> | 2015-04-30 17:36:35 +0100 |
|---|---|---|
| committer | yzhou <yzhou@krr-linux.cs.ox.ac.uk> | 2015-04-30 17:36:35 +0100 |
| commit | 0d8f240c9c0a64f2285324e5a517161e45c698fc (patch) | |
| tree | f4b4f7078e3be02011b9812cd8791c657a135993 /test/uk/ac/ox/cs/data/dbpedia/Normaliser.java | |
| parent | 68ae342b2a4923bc7b3f378c6a489f2355d85279 (diff) | |
| download | ACQuA-0d8f240c9c0a64f2285324e5a517161e45c698fc.tar.gz ACQuA-0d8f240c9c0a64f2285324e5a517161e45c698fc.zip | |
downgrade owl api and reorganised src files
Diffstat (limited to 'test/uk/ac/ox/cs/data/dbpedia/Normaliser.java')
| -rw-r--r-- | test/uk/ac/ox/cs/data/dbpedia/Normaliser.java | 155 |
1 files changed, 0 insertions, 155 deletions
diff --git a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java b/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java deleted file mode 100644 index e025604..0000000 --- a/test/uk/ac/ox/cs/data/dbpedia/Normaliser.java +++ /dev/null | |||
| @@ -1,155 +0,0 @@ | |||
| 1 | package uk.ac.ox.cs.data.dbpedia; | ||
| 2 | |||
| 3 | import java.io.BufferedReader; | ||
| 4 | import java.io.BufferedWriter; | ||
| 5 | import java.io.FileInputStream; | ||
| 6 | import java.io.FileOutputStream; | ||
| 7 | import java.io.IOException; | ||
| 8 | import java.io.InputStreamReader; | ||
| 9 | import java.io.OutputStreamWriter; | ||
| 10 | import java.text.Normalizer; | ||
| 11 | import java.util.HashMap; | ||
| 12 | import java.util.HashSet; | ||
| 13 | import java.util.Map; | ||
| 14 | import java.util.Set; | ||
| 15 | import java.util.regex.Pattern; | ||
| 16 | |||
/**
 * Command-line utility that samples lines from a text file (the default input
 * is a Turtle dump) and rewrites each kept line into a plainer form:
 * selected punctuation is removed, a fixed set of non-ASCII letters is
 * transliterated, dots outside the host part of an {@code http://} URL are
 * dropped, and combining diacritical marks are stripped.
 *
 * <p>Usage: {@code Normaliser <input file> <sampling interval>}. The result is
 * written next to the input file; see {@link #main(String[])} for the naming.
 */
public class Normaliser {

    /** Charset used for both input and output; Turtle documents are UTF-8. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset.forName("UTF-8");

    /** Matches runs of combining diacritical marks; compiled once because it is used per line. */
    private static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the input file and {@code args[1]} the
     *             sampling interval (every {@code args[1]}-th line is kept).
     *             With no arguments a built-in default path and an interval of
     *             1 are used.
     * @throws IOException if the input cannot be read or the output cannot be written
     * @throws IllegalArgumentException if exactly one argument is supplied
     * @throws NumberFormatException if the interval is not an integer
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            args = new String[] {
                "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl",
                "1"
            };
        }
        if (args.length < 2) {
            throw new IllegalArgumentException("usage: Normaliser <input file> <sampling interval>");
        }

        // Parse before opening any stream so a malformed interval cannot leak a file handle.
        int size = Integer.parseInt(args[1]);
        String fragment = fragmentPath(args[0], args[1]);

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), UTF_8));
        try {
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fragment), UTF_8));
            try {
                // simpleProcess(reader, writer, size) is the alternative that samples without normalising.
                process(reader, writer, size);
            } finally {
                // process(...) already closes on success; closing again is a no-op.
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Builds the output path for a given input path.
     *
     * <p>If the file name has an extension, {@code "_new_fragment" + sizeArg}
     * is inserted before it; otherwise {@code "_fragment" + sizeArg} is
     * appended. Only a dot located after the last path separator counts as the
     * start of an extension, so dots in directory names are left alone.
     */
    private static String fragmentPath(String inputPath, String sizeArg) {
        int dot = inputPath.lastIndexOf('.');
        int separator = Math.max(inputPath.lastIndexOf('/'), inputPath.lastIndexOf(java.io.File.separatorChar));
        if (dot > separator) {
            return inputPath.substring(0, dot) + "_new_fragment" + sizeArg + inputPath.substring(dot);
        }
        return inputPath + "_fragment" + sizeArg;
    }

    /**
     * Copies every {@code size}-th line from {@code reader} to {@code writer}
     * unchanged, each followed by a line separator.
     *
     * <p>Neither stream is flushed or closed here. If {@code size} is smaller
     * than 1 nothing is written.
     *
     * @throws IOException if reading or writing fails
     */
    public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        String line;
        int index = 0;
        while ((line = reader.readLine()) != null) {
            if (++index == size) {
                index = 0;
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /** Characters that are deleted from every processed line. */
    static final String illegalSymbols = ",()'‘";

    /**
     * Pairs of {single character, replacement text} applied to every processed line.
     * The "%60" to "_" substitution is not listed here because it is a
     * multi-character match; it is done at the start of {@code process(String)}.
     *
     * NOTE(review): some targets look chosen by visual similarity rather than by
     * the usual transliteration (e.g. sharp s to "t", eth to "o", thorn to "P") -
     * confirm these are intended before relying on the output.
     */
    static final String[][] replacedSymbols = new String[][] {
        {"æ", "ae"},
        {"ø", "o"},
        {"ß", "t"},
        {"Ł", "L"},
        {"ı", "i"},
        {"ł", "l"},
        {"–", "-"},
        {"&", "and"},
        {"ð", "o"},
        {"ə", "e"},
        {"Đ", "D"},
        {"ħ", "h"},
        {"đ", "d"},
        {"Þ", "P"}
    };

    /** Lookup form of {@link #illegalSymbols}. */
    static Set<Character> symbols2remove;

    /** Lookup form of {@link #replacedSymbols}, keyed by the first character of each pair. */
    static Map<Character, String> symbols2replace;

    static {
        symbols2remove = new HashSet<Character>();
        for (int i = 0; i < illegalSymbols.length(); ++i)
            symbols2remove.add(illegalSymbols.charAt(i));

        symbols2replace = new HashMap<Character, String>();
        for (int i = 0; i < replacedSymbols.length; ++i)
            symbols2replace.put(replacedSymbols[i][0].charAt(0), replacedSymbols[i][1]);
    }

    /** Prefix that marks the start of a URL; only plain {@code http://} is recognised. */
    static final String urlSymbols = "http://";

    /** Length of {@link #urlSymbols}; a match counter equal to this value means "inside the host part". */
    static final int urlSymbolLength = 7;

    /**
     * Samples and normalises lines.
     *
     * <p>Lines containing {@code '@'} are skipped and do not count towards the
     * sampling interval. Of the remaining lines every {@code size}-th one is
     * normalised, de-accented, terminated with {@code '.'} and a line separator,
     * and written out. If {@code size} is smaller than 1 nothing is written.
     *
     * <p>Both {@code reader} and {@code writer} are closed before this method
     * returns normally.
     *
     * @throws IOException if reading or writing fails
     */
    public static void process(BufferedReader reader, BufferedWriter writer, int size) throws IOException {
        int index = 0;
        String line;

        while ((line = reader.readLine()) != null) {
            if (line.contains("@"))
                continue;

            if (++index == size) {
                writer.write(deAccent(process(line)));
                writer.write('.');
                writer.newLine();
                index = 0;
            }
        }

        writer.close();
        reader.close();
    }

    /**
     * Normalises a single line.
     *
     * <p>{@code "%60"} becomes {@code "_"}; characters in
     * {@link #symbols2remove} are dropped; characters in
     * {@link #symbols2replace} are substituted. A {@code '.'} is kept only
     * while inside the host part of a URL, i.e. after {@code http://} and
     * before the next {@code '/'}, {@code '#'}, {@code ')'} or {@code '>'};
     * every other dot is removed.
     */
    private static String process(String line) {
        line = line.replace("%60", "_");

        // Number of leading characters of "http://" matched so far;
        // urlSymbolLength means the whole prefix was seen and we are in the host part.
        int inURL = 0;
        StringBuilder newLine = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i);

            if (ch == '.') {
                if (inURL == urlSymbolLength)
                    newLine.append('.');
                continue;
            }

            if (inURL == urlSymbolLength) {
                if (ch == '/' || ch == '#' || ch == ')' || ch == '>') inURL = 0;
            }
            else if (ch == urlSymbols.charAt(inURL)) {
                ++inURL;
            }
            else {
                // A mismatching character may itself begin a new match (e.g. "hhttp://"),
                // so restart at 1 instead of discarding it.
                inURL = (ch == urlSymbols.charAt(0)) ? 1 : 0;
            }

            String replacement = symbols2replace.get(ch);
            if (replacement != null)
                newLine.append(replacement);
            else if (!symbols2remove.contains(ch))
                newLine.append(ch);
        }

        return newLine.toString();
    }

    /**
     * Removes accents by decomposing {@code str} to NFD and deleting all
     * combining diacritical marks.
     *
     * @param str text to strip; must not be {@code null}
     * @return {@code str} in NFD form without combining diacritical marks
     */
    public static String deAccent(String str) {
        String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
        return DIACRITICAL_MARKS.matcher(nfdNormalizedString).replaceAll("");
    }

}
