|
| 1 | +import java.io.FileReader; |
1 | 2 | import java.io.IOException; |
| 3 | +import java.nio.file.Path; |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.HashMap; |
2 | 6 | import java.util.List; |
| 7 | +import java.util.Map; |
3 | 8 |
|
4 | 9 | import com.formulasearchengine.mathmltools.gold.GoldStandardLoader; |
| 10 | +import com.formulasearchengine.mathmltools.gold.GoldUtils; |
5 | 11 | import com.formulasearchengine.mathmltools.gold.pojo.JsonGouldiBean; |
6 | 12 | import com.formulasearchengine.mathmltools.io.XmlDocumentReader; |
7 | 13 | import com.formulasearchengine.mathmltools.mml.CMMLInfo; |
8 | 14 | import com.formulasearchengine.mathmltools.mml.MathDoc; |
9 | 15 | import com.formulasearchengine.mathmltools.utils.mml.CSymbol; |
10 | | -import com.google.common.collect.HashMultiset; |
| 16 | +import com.google.common.collect.Multiset; |
11 | 17 | import com.google.common.collect.TreeMultiset; |
| 18 | +import org.apache.commons.csv.CSVFormat; |
| 19 | +import org.apache.commons.csv.CSVRecord; |
12 | 20 | import org.w3c.dom.Document; |
13 | 21 | import org.xml.sax.SAXException; |
14 | 22 |
|
15 | 23 | public class SymbolListExample { |
/** Private so that this holder of static methods cannot be instantiated from outside. */
private SymbolListExample() {
    // intentionally empty
}
18 | 26 |
|
19 | | - public static void main(String[] args) throws IOException, SAXException { |
| 27 | + public static void main(String[] args) throws Exception { |
20 | 28 | final GoldStandardLoader gold = GoldStandardLoader.getInstance(); |
21 | 29 | gold.initLocally(); |
22 | | - final HashMultiset<CSymbol> allSymbols = HashMultiset.create(); |
| 30 | + final Map<String, Integer> omcdMap = new HashMap<>(); |
| 31 | + final TreeMultiset<CSymbol> allSymbols = getcSymbols(gold); |
| 32 | + final TreeMultiset<String> normalizedSymbols = TreeMultiset.create(); |
| 33 | + |
| 34 | + System.out.println(allSymbols); |
| 35 | + readOmCdMap(omcdMap); |
| 36 | + normalizeSymbols(omcdMap, allSymbols, normalizedSymbols); |
| 37 | + |
| 38 | + System.out.println(normalizedSymbols); |
| 39 | + |
| 40 | + |
| 41 | + } |
| 42 | + |
| 43 | + private static void normalizeSymbols(Map<String, Integer> omcdMap, TreeMultiset<CSymbol> allSymbols, TreeMultiset<String> normalizedSymbols) { |
| 44 | + for (Multiset.Entry<CSymbol> cSymbolEntry : allSymbols.entrySet()) { |
| 45 | + final String elem = cSymbolEntry.getElement().toString(); |
| 46 | + if (omcdMap.containsKey(elem)) { |
| 47 | + normalizedSymbols.add("wikidata:Q" + omcdMap.get(elem), cSymbolEntry.getCount()); |
| 48 | + continue; |
| 49 | + } |
| 50 | + if (cSymbolEntry.getElement().getCd().equals("latexml")) { |
| 51 | + if (cSymbolEntry.getElement().getCName().startsWith("Q")) { |
| 52 | + normalizedSymbols.add("wikidata:" + cSymbolEntry.getElement().getCName(), cSymbolEntry.getCount()); |
| 53 | + continue; |
| 54 | + } |
| 55 | + } |
| 56 | + normalizedSymbols.add(elem, cSymbolEntry.getCount()); |
| 57 | + } |
| 58 | + } |
| 59 | + |
| 60 | + public static void readOmCdMap(Map<String, Integer> omcdMap) throws IOException { |
| 61 | + final Path goldPath = GoldStandardLoader.getInstance().getGoldPath(); |
| 62 | + |
| 63 | + FileReader in = new FileReader(goldPath.resolve("../doc/openMathSymbols.csv").toFile()); |
| 64 | + Iterable<CSVRecord> records = CSVFormat.RFC4180.parse(in); |
| 65 | + for (CSVRecord record : records) { |
| 66 | + String omcd = record.get(0); |
| 67 | + String wikidata = record.get(2); |
| 68 | + int wikidataInt = Integer.parseInt(wikidata.replaceAll("Q(\\d+)", "$1")); |
| 69 | + omcdMap.put(omcd, wikidataInt); |
| 70 | + } |
| 71 | + } |
| 72 | + |
| 73 | + public static TreeMultiset<CSymbol> getcSymbols(GoldStandardLoader gold) throws IOException { |
| 74 | + final TreeMultiset<CSymbol> allSymbols = TreeMultiset.create(); |
23 | 75 | // TODO file a bug that gold should implement iterable |
24 | | - for (int i = 1; i < 3; i++) { |
| 76 | + for (int i = 1; i < 305; i++) { |
25 | 77 | final JsonGouldiBean gouldiJson = gold.getGouldiJson(i); |
26 | | - final List<CSymbol> cSymbols = getCSymbols(gouldiJson); |
| 78 | + List<CSymbol> cSymbols; |
| 79 | + try { |
| 80 | + cSymbols = getCSymbols(gouldiJson); |
| 81 | + } catch (SAXException e) { |
| 82 | + cSymbols = fixException(i, e, gouldiJson); |
| 83 | + } |
27 | 84 | TreeMultiset<CSymbol> currentSymbols = TreeMultiset.create(cSymbols); |
28 | 85 | allSymbols.addAll(currentSymbols); |
29 | | - System.out.println(currentSymbols); |
30 | 86 | } |
| 87 | + return allSymbols; |
| 88 | + } |
| 89 | + |
| 90 | + private static List<CSymbol> fixException(int i, SAXException e, JsonGouldiBean gouldiJson) { |
| 91 | + List<CSymbol> cSymbols; |
| 92 | + cSymbols = new ArrayList<>(); |
| 93 | + if (e.getMessage().equals("Attribute \"xmlns:m\" must be declared for element type \"power\".")) { |
| 94 | + final String newMml = gouldiJson.getMml().replaceAll("xmlns:m=\"http://www.w3.org/1998/Math/MathML\"", ""); |
| 95 | + gouldiJson.setMml(newMml); |
| 96 | + final Path goldPath = GoldStandardLoader.getInstance().getGoldPath(); |
31 | 97 |
|
| 98 | + GoldUtils.writeGoldFile(goldPath.resolve(i + ".json"), gouldiJson); |
| 99 | + } |
| 100 | +// System.out.println(gouldiJson.getMml()); |
| 101 | +// System.out.println(e.getMessage()); |
| 102 | +// System.err.println(i); |
| 103 | + return cSymbols; |
32 | 104 | } |
33 | 105 |
|
| 106 | + |
34 | 107 | private static List<CSymbol> getCSymbols(JsonGouldiBean gouldiJson) throws IOException, SAXException { |
35 | 108 | final String mmlString = gouldiJson.getMml(); |
36 | 109 | final Document doc = XmlDocumentReader.parse(mmlString); |
|
0 commit comments