Skip to content

Commit a1eccec

Browse files
committed
Merge branch 'add-converters' of github.com:ag-gipp/MathMLTools into add-converters
2 parents 3677552 + 94dfda5 commit a1eccec

8 files changed

Lines changed: 213 additions & 19 deletions

File tree

mathml-core/src/main/java/com/formulasearchengine/mathmltools/mml/MathDoc.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ public List<CSymbol> getCSymbols() {
237237
if (cSymbols == null) {
238238
final IterableNodeList nodeList = new IterableNodeList(dom.getElementsByTagName("csymbol"));
239239
cSymbols = new ArrayList<>();
240-
nodeList.forEach(n -> cSymbols.add(new CSymbol((Element) n, false)));
240+
nodeList.forEach(n -> cSymbols.add(new CSymbol((Element) n)));
241241
}
242242
return cSymbols;
243243
}

mathml-gold/src/main/java/com/formulasearchengine/mathmltools/gold/GoldStandardLoader.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,7 @@ public void init() {
7373
}
7474

7575
public int initLocally() {
76-
props = ConfigLoader.CONFIG;
77-
max = Integer.parseInt(props.getProperty(ConfigLoader.GOULDI_MAXIMUM_NUM));
78-
79-
String goldPath = props.getProperty(ConfigLoader.GOULDI_LOCAL_PATH);
80-
Path path = Paths.get(goldPath);
76+
Path path = getGoldPath();
8177
gouldi = new JsonGouldiBean[max];
8278

8379
ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2);
@@ -97,6 +93,14 @@ public int initLocally() {
9793
return max;
9894
}
9995

96+
public Path getGoldPath() {
97+
props = ConfigLoader.CONFIG;
98+
max = Integer.parseInt(props.getProperty(ConfigLoader.GOULDI_MAXIMUM_NUM));
99+
100+
String goldPath = props.getProperty(ConfigLoader.GOULDI_LOCAL_PATH);
101+
return Paths.get(goldPath);
102+
}
103+
100104
public GitHubFileResponse getResponseFromGouldiRequest(int number) {
101105
String file = number + ".json";
102106
return rest.getForObject(

mathml-utils/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,4 @@
2727
<version>${commons.io.version}</version>
2828
</dependency>
2929
</dependencies>
30-
</project>
30+
</project>

mathml-utils/src/main/java/com/formulasearchengine/mathmltools/utils/mml/CSymbol.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,10 @@
44
import org.w3c.dom.Element;
55

66
public class CSymbol implements Comparable<CSymbol> {
7-
private boolean strict;
87
private Element n;
8+
private static final String SERIALIZATION_SEPARATOR = ":";
99

10-
public CSymbol(Element n, boolean strict) {
11-
this.strict = strict;
10+
public CSymbol(Element n) {
1211
this.n = n;
1312
}
1413

@@ -26,7 +25,7 @@ public void setCd(String cd) {
2625

2726
@Override
2827
public String toString() {
29-
return getCd() + ":" + getCName();
28+
return getCd() + SERIALIZATION_SEPARATOR + getCName();
3029
}
3130

3231
@Override

xamples/MathMLben

xamples/pom.xml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
</parent>
1010
<modelVersion>4.0.0</modelVersion>
1111

12-
<groupId>com.formulasearchengine.mathmltools</groupId>
1312
<artifactId>xamples</artifactId>
1413
<dependencies>
1514
<dependency>
@@ -18,12 +17,27 @@
1817
<version>2.0.2-SNAPSHOT</version>
1918
<scope>compile</scope>
2019
</dependency>
20+
<dependency>
21+
<groupId>org.wikidata.wdtk</groupId>
22+
<artifactId>wdtk-wikibaseapi</artifactId>
23+
<version>0.9.0</version>
24+
</dependency>
2125
<dependency>
2226
<groupId>com.formulasearchengine.mathmltools</groupId>
2327
<artifactId>mathml-core</artifactId>
2428
<version>2.0.2-SNAPSHOT</version>
2529
<scope>compile</scope>
2630
</dependency>
31+
<dependency>
32+
<groupId>org.apache.commons</groupId>
33+
<artifactId>commons-csv</artifactId>
34+
<version>1.5</version>
35+
</dependency>
36+
<dependency>
37+
<groupId>org.apache.commons</groupId>
38+
<artifactId>commons-collections4</artifactId>
39+
<version>4.1</version>
40+
</dependency>
2741
</dependencies>
2842

2943

xamples/src/main/java/SymbolListExample.java

Lines changed: 79 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,109 @@
1+
import java.io.FileReader;
12
import java.io.IOException;
3+
import java.nio.file.Path;
4+
import java.util.ArrayList;
5+
import java.util.HashMap;
26
import java.util.List;
7+
import java.util.Map;
38

49
import com.formulasearchengine.mathmltools.gold.GoldStandardLoader;
10+
import com.formulasearchengine.mathmltools.gold.GoldUtils;
511
import com.formulasearchengine.mathmltools.gold.pojo.JsonGouldiBean;
612
import com.formulasearchengine.mathmltools.io.XmlDocumentReader;
713
import com.formulasearchengine.mathmltools.mml.CMMLInfo;
814
import com.formulasearchengine.mathmltools.mml.MathDoc;
915
import com.formulasearchengine.mathmltools.utils.mml.CSymbol;
10-
import com.google.common.collect.HashMultiset;
16+
import com.google.common.collect.Multiset;
1117
import com.google.common.collect.TreeMultiset;
18+
import org.apache.commons.csv.CSVFormat;
19+
import org.apache.commons.csv.CSVRecord;
1220
import org.w3c.dom.Document;
1321
import org.xml.sax.SAXException;
1422

1523
public class SymbolListExample {
1624
private SymbolListExample() {
1725
}
1826

19-
public static void main(String[] args) throws IOException, SAXException {
27+
public static void main(String[] args) throws Exception {
2028
final GoldStandardLoader gold = GoldStandardLoader.getInstance();
2129
gold.initLocally();
22-
final HashMultiset<CSymbol> allSymbols = HashMultiset.create();
30+
final Map<String, Integer> omcdMap = new HashMap<>();
31+
final TreeMultiset<CSymbol> allSymbols = getcSymbols(gold);
32+
final TreeMultiset<String> normalizedSymbols = TreeMultiset.create();
33+
34+
System.out.println(allSymbols);
35+
readOmCdMap(omcdMap);
36+
normalizeSymbols(omcdMap, allSymbols, normalizedSymbols);
37+
38+
System.out.println(normalizedSymbols);
39+
40+
41+
}
42+
43+
private static void normalizeSymbols(Map<String, Integer> omcdMap, TreeMultiset<CSymbol> allSymbols, TreeMultiset<String> normalizedSymbols) {
44+
for (Multiset.Entry<CSymbol> cSymbolEntry : allSymbols.entrySet()) {
45+
final String elem = cSymbolEntry.getElement().toString();
46+
if (omcdMap.containsKey(elem)) {
47+
normalizedSymbols.add("wikidata:Q" + omcdMap.get(elem), cSymbolEntry.getCount());
48+
continue;
49+
}
50+
if (cSymbolEntry.getElement().getCd().equals("latexml")) {
51+
if (cSymbolEntry.getElement().getCName().startsWith("Q")) {
52+
normalizedSymbols.add("wikidata:" + cSymbolEntry.getElement().getCName(), cSymbolEntry.getCount());
53+
continue;
54+
}
55+
}
56+
normalizedSymbols.add(elem, cSymbolEntry.getCount());
57+
}
58+
}
59+
60+
public static void readOmCdMap(Map<String, Integer> omcdMap) throws IOException {
61+
final Path goldPath = GoldStandardLoader.getInstance().getGoldPath();
62+
63+
FileReader in = new FileReader(goldPath.resolve("../doc/openMathSymbols.csv").toFile());
64+
Iterable<CSVRecord> records = CSVFormat.RFC4180.parse(in);
65+
for (CSVRecord record : records) {
66+
String omcd = record.get(0);
67+
String wikidata = record.get(2);
68+
int wikidataInt = Integer.parseInt(wikidata.replaceAll("Q(\\d+)", "$1"));
69+
omcdMap.put(omcd, wikidataInt);
70+
}
71+
}
72+
73+
public static TreeMultiset<CSymbol> getcSymbols(GoldStandardLoader gold) throws IOException {
74+
final TreeMultiset<CSymbol> allSymbols = TreeMultiset.create();
2375
// TODO file a bug that gold should implement iterable
24-
for (int i = 1; i < 3; i++) {
76+
for (int i = 1; i < 305; i++) {
2577
final JsonGouldiBean gouldiJson = gold.getGouldiJson(i);
26-
final List<CSymbol> cSymbols = getCSymbols(gouldiJson);
78+
List<CSymbol> cSymbols;
79+
try {
80+
cSymbols = getCSymbols(gouldiJson);
81+
} catch (SAXException e) {
82+
cSymbols = fixException(i, e, gouldiJson);
83+
}
2784
TreeMultiset<CSymbol> currentSymbols = TreeMultiset.create(cSymbols);
2885
allSymbols.addAll(currentSymbols);
29-
System.out.println(currentSymbols);
3086
}
87+
return allSymbols;
88+
}
89+
90+
private static List<CSymbol> fixException(int i, SAXException e, JsonGouldiBean gouldiJson) {
91+
List<CSymbol> cSymbols;
92+
cSymbols = new ArrayList<>();
93+
if (e.getMessage().equals("Attribute \"xmlns:m\" must be declared for element type \"power\".")) {
94+
final String newMml = gouldiJson.getMml().replaceAll("xmlns:m=\"http://www.w3.org/1998/Math/MathML\"", "");
95+
gouldiJson.setMml(newMml);
96+
final Path goldPath = GoldStandardLoader.getInstance().getGoldPath();
3197

98+
GoldUtils.writeGoldFile(goldPath.resolve(i + ".json"), gouldiJson);
99+
}
100+
// System.out.println(gouldiJson.getMml());
101+
// System.out.println(e.getMessage());
102+
// System.err.println(i);
103+
return cSymbols;
32104
}
33105

106+
34107
private static List<CSymbol> getCSymbols(JsonGouldiBean gouldiJson) throws IOException, SAXException {
35108
final String mmlString = gouldiJson.getMml();
36109
final Document doc = XmlDocumentReader.parse(mmlString);
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import java.io.FileReader;
2+
import java.io.IOException;
3+
import java.io.UnsupportedEncodingException;
4+
import java.net.URLEncoder;
5+
import java.nio.file.Path;
6+
import java.util.ArrayList;
7+
import java.util.Map;
8+
import java.util.TreeMap;
9+
10+
import com.formulasearchengine.mathmltools.gold.GoldStandardLoader;
11+
import org.apache.commons.collections4.BidiMap;
12+
import org.apache.commons.collections4.bidimap.DualHashBidiMap;
13+
import org.apache.commons.csv.CSVFormat;
14+
import org.apache.commons.csv.CSVRecord;
15+
import org.wikidata.wdtk.datamodel.interfaces.EntityDocument;
16+
import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
17+
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
18+
import org.wikidata.wdtk.datamodel.interfaces.SiteLink;
19+
import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher;
20+
import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException;
21+
22+
public class WikidataExample {
23+
24+
private static final WikibaseDataFetcher FETCHER = WikibaseDataFetcher.getWikidataDataFetcher();
25+
private static final BidiMap<String, Integer> omCDMap = new DualHashBidiMap<>();
26+
private static final Map<String, String> allSymbols = new TreeMap<>();
27+
private static final String LANG = "en";
28+
private static final String prefix = "wikidata:Q";
29+
30+
public static void main(String[] args) throws MediaWikiApiErrorException, IOException {
31+
SymbolListExample.readOmCdMap(omCDMap);
32+
final WikidataExample example = new WikidataExample();
33+
//final String omCd = example.getOmCd(example.getItem(1226939));
34+
//System.out.println(omCd);
35+
example.processFile();
36+
37+
}
38+
39+
private ItemDocument getItem(long qId) throws MediaWikiApiErrorException {
40+
return (ItemDocument) FETCHER.getEntityDocument("Q" + qId);
41+
}
42+
43+
private String getOmCd(ItemDocument item) throws UnsupportedEncodingException {
44+
45+
final long revisionId = item.getRevisionId();
46+
final MonolingualTextValue label = item.getLabels().get(LANG);
47+
final MonolingualTextValue description = item.getDescriptions().get(LANG);
48+
final SiteLink siteLink = item.getSiteLinks().get(LANG + "wiki");
49+
final String qIdString = item.getEntityId().getId();
50+
String descr = "";
51+
String sortkey = "";
52+
if (label != null) {
53+
final String labelText = label.getText();
54+
descr += labelText + "\n";
55+
final String[] split = labelText.split(" ");
56+
sortkey = split[split.length - 1];
57+
}
58+
if (description != null) {
59+
descr += description.getText() + "\n";
60+
}
61+
if (siteLink != null) {
62+
descr += "https://" + LANG + ".wikipedia.org/w/index.php?title=" + URLEncoder.encode(siteLink.getPageTitle(), "utf8") + "\n";
63+
}
64+
Integer qId = Integer.valueOf(qIdString.substring(1));
65+
if (omCDMap.containsValue(qId)) {
66+
descr += "See also " + omCDMap.getKey(qId) + "\n";
67+
}
68+
descr += "\n This description was generated from http://www.wikidata.org/w/index.php?oldid=" + revisionId;
69+
descr = " <CDDefinition>\n"
70+
+ " <Name>" + qIdString + "</Name>\n"
71+
+ " <Role>application</Role>\n"
72+
+ " <Description>\n" + descr
73+
+ " </Description>\n"
74+
+ "</CDDefinition>";
75+
sortkey += qId;
76+
allSymbols.put(sortkey, descr);
77+
return descr;
78+
79+
}
80+
81+
private void processFile() throws IOException, MediaWikiApiErrorException {
82+
final Path goldPath = GoldStandardLoader.getInstance().getGoldPath();
83+
84+
FileReader in = new FileReader(goldPath.resolve("../doc/wiki-cd-freqs.csv").toFile());
85+
Iterable<CSVRecord> records = CSVFormat.RFC4180.parse(in);
86+
final ArrayList<String> qIds = new ArrayList<>();
87+
for (CSVRecord record : records) {
88+
String symbol = record.get(0);
89+
if (symbol.startsWith(prefix)) {
90+
final int qId = Integer.parseInt(symbol.substring(prefix.length()));
91+
qIds.add("Q" + qId);
92+
}
93+
}
94+
final Map<String, EntityDocument> entityDocuments = FETCHER.getEntityDocuments(qIds);
95+
for (Map.Entry<String, EntityDocument> entry : entityDocuments.entrySet()) {
96+
getOmCd((ItemDocument) entry.getValue());
97+
}
98+
for (String s : allSymbols.keySet()) {
99+
System.out.println(allSymbols.get(s));
100+
}
101+
102+
}
103+
104+
}

0 commit comments

Comments
 (0)