/*
 * Normaliser.java, reconstructed from the git patch that created
 * test/uk/ac/ox/cs/data/dbpedia/Normaliser.java
 * (commit 9ce65c5a963b03ee97fe9cb6c5aa65a3c04a80a8, subject "initial version",
 * author yzhou, Tue, 21 Apr 2015 10:34:27 +0100).
 *
 * The patch declares this class in package uk.ac.ox.cs.data.dbpedia; add that
 * package line back when the file is placed in the source tree.
 *
 * This file is deliberately ASCII-only: every non-ASCII character is written as
 * a Unicode escape so the result does not depend on the compiler's source
 * encoding.
 */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Command-line tool that copies every {@code size}-th line of a text file to a
 * new "fragment" file, cleaning each copied line on the way: selected symbols
 * are removed or transliterated, dots are dropped except directly after an
 * {@code http://} prefix, and combining diacritical marks are stripped.
 */
public class Normaliser {

    /** Input file used when the program is started without arguments. */
    private static final String DEFAULT_INPUT =
            "/home/yzhou/ontologies/npd/npd-data-dump-minus-datatype.ttl";

    /** Sampling step used when the program is started without arguments. */
    private static final String DEFAULT_SIZE = "1";

    /** Characters deleted from every processed line; U+2018 is the left single quotation mark. */
    static final String illegalSymbols = ",()'\u2018";

    /**
     * Transliteration table: the first character of column 0 is replaced by column 1.
     *
     * <p>Only single characters can be listed here, because the lookup map is keyed by
     * {@code charAt(0)}; that is why the three-character sequence "%60" is handled
     * separately in {@link #process(String)}.
     *
     * <p>NOTE(review): sharp s maps to "t", eth to "o" and thorn to "P". These look
     * unusual (the common transliterations are "ss", "d" and "Th"), but they are kept
     * unchanged because previously normalised data may depend on them - confirm before
     * changing.
     */
    static final String[][] replacedSymbols = new String[][] {
        {"\u00e6", "ae"},  // U+00E6 ae ligature
        {"\u00f8", "o"},   // U+00F8 o with stroke
        {"\u00df", "t"},   // U+00DF sharp s
        {"\u0141", "L"},   // U+0141 capital L with stroke
        {"\u0131", "i"},   // U+0131 dotless i
        {"\u0142", "l"},   // U+0142 small l with stroke
        {"\u2013", "-"},   // U+2013 en dash
        {"&", "and"},
        {"\u00f0", "o"},   // U+00F0 eth
        {"\u0259", "e"},   // U+0259 schwa
        {"\u0110", "D"},   // U+0110 capital D with stroke
        {"\u0127", "h"},   // U+0127 h with stroke
        {"\u0111", "d"},   // U+0111 small d with stroke
        {"\u00de", "P"}    // U+00DE capital thorn
    };

    /** Lookup set built from {@link #illegalSymbols}. */
    static final Set<Character> symbols2remove = new HashSet<Character>();

    /** Lookup map built from {@link #replacedSymbols}. */
    static final Map<Character, String> symbols2replace = new HashMap<Character, String>();

    static {
        for (int i = 0; i < illegalSymbols.length(); ++i) {
            symbols2remove.add(illegalSymbols.charAt(i));
        }
        for (String[] pair : replacedSymbols) {
            symbols2replace.put(pair[0].charAt(0), pair[1]);
        }
    }

    /** Prefix after which dots are preserved. */
    static final String urlSymbols = "http://";

    /** Number of characters that must match before the scanner counts as "inside a URL". */
    static final int urlSymbolLength = urlSymbols.length();

    /** Matches runs of combining diacritical marks; compiled once instead of per call. */
    private static final Pattern DIACRITICS =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the input file and {@code args[1]} the sampling step
     *             (every {@code size}-th line is kept); with no arguments a built-in default
     *             input and a step of 1 are used
     * @throws IOException              if reading or writing fails
     * @throws IllegalArgumentException if only one argument is given, or the step is not a
     *                                  positive integer
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            args = new String[] { DEFAULT_INPUT, DEFAULT_SIZE };
        }
        if (args.length < 2) {
            throw new IllegalArgumentException("usage: Normaliser <input-file> <size>");
        }

        int size = Integer.parseInt(args[1]);
        if (size < 1) {
            // A step below 1 can never be reached by the line counter, so the
            // output would silently be empty.
            throw new IllegalArgumentException("size must be >= 1 but was " + size);
        }

        String fragment = fragmentPath(args[0], args[1]);

        // Explicit UTF-8: the processing matches non-ASCII characters, so the
        // result must not depend on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(args[0]), StandardCharsets.UTF_8));
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(fragment), StandardCharsets.UTF_8))) {
            // simpleProcess(reader, writer, size) is the alternative that samples without cleaning.
            process(reader, writer, size);
        }
    }

    /**
     * Derives the output file name from the input file name.
     *
     * <p>If the file name has an extension, {@code _new_fragment<size>} is inserted before
     * it; otherwise {@code _fragment<size>} is appended. Only a dot after the last path
     * separator counts as the start of an extension, so a dot in a directory name is ignored.
     *
     * @param inputPath path of the input file
     * @param sizeArg   sampling step exactly as given on the command line
     * @return path of the fragment file to write
     */
    static String fragmentPath(String inputPath, String sizeArg) {
        int lastSeparator = Math.max(inputPath.lastIndexOf('/'), inputPath.lastIndexOf('\\'));
        int dot = inputPath.lastIndexOf('.');
        if (dot > lastSeparator) {
            return inputPath.substring(0, dot) + "_new_fragment" + sizeArg + inputPath.substring(dot);
        }
        return inputPath + "_fragment" + sizeArg;
    }

    /**
     * Copies every {@code size}-th line from {@code reader} to {@code writer} unchanged.
     * Neither stream is closed or flushed by this method.
     *
     * @param reader source of lines
     * @param writer destination
     * @param size   sampling step; values below 1 copy nothing
     * @throws IOException if reading or writing fails
     */
    public static void simpleProcess(BufferedReader reader, BufferedWriter writer, int size)
            throws IOException {
        String line;
        int index = 0;
        while ((line = reader.readLine()) != null) {
            if (++index == size) {
                index = 0;
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /**
     * Copies a cleaned version of every {@code size}-th line from {@code reader} to
     * {@code writer}.
     *
     * <p>Lines containing {@code "@"} are skipped and do not count towards the step. Each
     * kept line is cleaned by {@link #process(String)} and {@link #deAccent(String)} and
     * written followed by a single {@code '.'} and a line separator.
     *
     * <p>Both streams are closed before this method returns, including when it fails;
     * callers may close them again safely.
     *
     * @param reader source of lines
     * @param writer destination
     * @param size   sampling step; values below 1 copy nothing
     * @throws IOException if reading or writing fails
     */
    public static void process(BufferedReader reader, BufferedWriter writer, int size)
            throws IOException {
        // Resources are closed in reverse order: the writer first, then the reader.
        try (BufferedReader in = reader; BufferedWriter out = writer) {
            int index = 0;
            String line;
            while ((line = in.readLine()) != null) {
                if (line.contains("@")) {
                    continue;
                }
                if (++index == size) {
                    out.write(deAccent(process(line)));
                    // Dots outside URL prefixes were removed from the line, so the
                    // terminating dot is written back explicitly.
                    out.write('.');
                    out.newLine();
                    index = 0;
                }
            }
        }
    }

    /**
     * Cleans one line: replaces "%60" by "_", applies the removal and transliteration
     * tables, and drops every dot that is not between an {@code http://} prefix and the
     * next {@code '/'}, {@code '#'}, {@code ')'} or {@code '>'}.
     */
    private static String process(String line) {
        line = line.replace("%60", "_");

        // Number of leading characters of urlSymbols matched so far;
        // urlSymbolLength means the scanner is inside the part where dots are kept.
        int inURL = 0;
        StringBuilder newLine = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i);

            if (ch == '.') {
                if (inURL == urlSymbolLength) {
                    newLine.append('.');
                }
                // A dot never changes the matching state.
                continue;
            }

            if (inURL == urlSymbolLength) {
                if (ch == '/' || ch == '#' || ch == ')' || ch == '>') {
                    inURL = 0;
                }
            } else if (ch == urlSymbols.charAt(inURL)) {
                ++inURL;
            } else {
                // Restart the match; the mismatching character may itself begin a
                // new prefix (for example the second 'h' in "hhttp://").
                inURL = (ch == urlSymbols.charAt(0)) ? 1 : 0;
            }

            String replacement = symbols2replace.get(ch);
            if (replacement != null) {
                newLine.append(replacement);
            } else if (!symbols2remove.contains(ch)) {
                newLine.append(ch);
            }
        }

        return newLine.toString();
    }

    /**
     * Removes accents by decomposing the text (Unicode NFD) and deleting all combining
     * diacritical marks.
     *
     * @param str text to strip
     * @return {@code str} in NFD form without combining diacritical marks
     */
    public static String deAccent(String str) {
        String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
        return DIACRITICS.matcher(nfdNormalizedString).replaceAll("");
    }
}