Skip to content

Commit e3dfe8b

Browse files
Skip malformed CSV lines in WikiInfo parser (#832)
1 parent: 907552e · commit: e3dfe8b

2 files changed

Lines changed: 102 additions & 62 deletions

File tree

core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala

Lines changed: 62 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -20,68 +20,89 @@ class WikiInfo(val wikicode: String, val pages: Int)
2020
object WikiInfo
2121
{
2222
val logger = Logger.getLogger(WikiInfo.getClass.getName)
23+
2324
// hard-coded - there probably is no mirror, and the format is very specific.
24-
// TODO: user might want to use a local file...
25-
// TODO: mayby change this to XML serialization
2625
val URL = new URL("http://wikistats.wmflabs.org/api.php?action=dump&table=wikipedias&format=csv")
27-
28-
// Most browsers would save the file with this name, because s23.org returns it in a http header.
26+
27+
// Most browsers would save the file with this name
2928
val FileName = "wikipedias.csv"
30-
29+
3130
def fromFile(file: File, codec: Codec): Seq[WikiInfo] = {
3231
val source = Source.fromFile(file)(codec)
3332
try fromSource(source) finally source.close
3433
}
35-
34+
3635
def fromURL(url: URL, codec: Codec): Seq[WikiInfo] = {
3736
val source = Source.fromURL(url)(codec)
3837
try fromSource(source) finally source.close
3938
}
40-
41-
def fromSource(source: Source): Seq[WikiInfo] = {
39+
40+
def fromSource(source: Source): Seq[WikiInfo] = {
4241
fromLines(source.getLines)
4342
}
44-
43+
4544
/**
46-
* Retrieves a list of all available Wikipedias from a CSV file like http://s23.org/wikistats/wikipedias_csv.php
47-
*
48-
*/
49-
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
45+
* Retrieves a list of all available Wikipedias from a CSV file.
46+
*/
47+
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
5048
val info = new ArrayBuffer[WikiInfo]
51-
52-
if (! lines.hasNext) throw new Exception("empty file")
53-
lines.next // skip first line (headers)
54-
55-
for (line <- lines)
56-
if (line.nonEmpty)
57-
fromLine(line) match{
58-
case Some(x) => info += x
59-
case None =>
49+
50+
if (!lines.hasNext) {
51+
logger.warning("wikipedias.csv is empty")
52+
return info
53+
}
54+
55+
lines.next() // skip header
56+
57+
for (line <- lines) {
58+
if (line.nonEmpty) {
59+
fromLine(line) match {
60+
case Some(wikiInfo) => info += wikiInfo
61+
case None => // skip malformed line
6062
}
61-
63+
}
64+
}
65+
6266
info
6367
}
64-
68+
6569
/**
6670
* Reads a WikiInfo object from a single CSV line.
71+
* Malformed lines are logged and skipped.
6772
*/
6873
def fromLine(line: String): Option[WikiInfo] = {
69-
val fields = line.split(",", -1)
70-
71-
if (fields.length < 15) throw new Exception("expected [15] fields, found ["+fields.length+"] in line ["+line+"]")
72-
73-
val pages = try fields(4).toInt
74-
catch { case nfe: NumberFormatException => 0 }
75-
76-
val wikiCode = fields(2)
77-
if (! ConfigUtils.LanguageRegex.pattern.matcher(fields(2)).matches) throw new Exception("expected language code in field with index [2], found line ["+line+"]")
78-
79-
//if(Language.map.keySet.contains(wikiCode))
80-
Option(new WikiInfo(wikiCode, pages))
81-
//else
82-
//{
83-
// logger.log(Level.WARNING, "Language: " + wikiCode + " will be ignored. Add this language to the addonlangs.json file to extract it.")
84-
// None
85-
//}
74+
75+
val fields = line.split(",", -1)
76+
77+
// 1️⃣ Validate field count
78+
if (fields.length < 15) {
79+
logger.warning(
80+
s"Skipping malformed CSV line: expected 15 fields, found ${fields.length}. Line: [$line]"
81+
)
82+
return None
83+
}
84+
85+
// 2️⃣ Parse pages safely
86+
val pages =
87+
try fields(4).toInt
88+
catch {
89+
case _: NumberFormatException =>
90+
logger.warning(
91+
s"Invalid page count in CSV line, defaulting to 0. Line: [$line]"
92+
)
93+
0
94+
}
95+
96+
// 3️⃣ Validate language code
97+
val wikiCode = fields(2)
98+
if (!ConfigUtils.LanguageRegex.pattern.matcher(wikiCode).matches) {
99+
logger.warning(
100+
s"Invalid language code [$wikiCode] in CSV line, skipping. Line: [$line]"
101+
)
102+
return None
103+
}
104+
105+
// 4️⃣ Valid line → create WikiInfo
106+
Some(new WikiInfo(wikiCode, pages))
86107
}
87108
}

server/src/main/scala/org/dbpedia/extraction/server/ExtractionManager.scala

Lines changed: 40 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -148,31 +148,50 @@ abstract class ExtractionManager(
148148

149149
protected def loadMappingPages(language : Language) : Map[WikiTitle, WikiPage] =
150150
{
151-
val namespace = language.wikiCode match {
152-
case "wikidata" =>
153-
Namespace.mappings(Language.English)
154-
case _ =>
155-
Namespace.mappings.getOrElse(language, throw new NoSuchElementException("no mapping namespace for language "+language.wikiCode))
156-
}
151+
val namespaceOpt = language.wikiCode match {
152+
case "wikidata" =>
153+
Some(Namespace.mappings(Language.English))
154+
case _ =>
155+
Namespace.mappings.get(language)
156+
}
157157

158+
if (namespaceOpt.isEmpty) {
159+
logger.warning(
160+
s"No mapping namespace for language ${language.wikiCode} – skipping mapping pages."
161+
)
162+
return Map.empty
163+
}
158164

159-
val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
160-
{
161-
val file = new File(paths.mappingsDir, namespace.name(Language.Mappings).replace(' ','_')+".xml")
162-
if(!file.exists()) {
163-
logger.warning("MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED")
164-
return Map[WikiTitle, WikiPage]()
165-
}
166-
logger.warning("LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE ["+file+"] - MAY BE OUTDATED - ONLY FOR TESTING!")
167-
XMLSource.fromFile(file, language) // TODO: use Language.Mappings?
168-
}
169-
else
170-
{
171-
val url = paths.apiUrl
172-
WikiSource.fromNamespaces(Set(namespace), url, language) // TODO: use Language.Mappings?
165+
val namespace = namespaceOpt.get
166+
167+
val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
168+
{
169+
val file = new File(
170+
paths.mappingsDir,
171+
namespace.name(Language.Mappings).replace(' ','_') + ".xml"
172+
)
173+
174+
if (!file.exists()) {
175+
logger.warning(
176+
"MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED"
177+
)
178+
return Map.empty
173179
}
174180

175-
source.map(page => (page.title, page)).toMap
181+
logger.warning(
182+
"LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE [" + file +
183+
"] - MAY BE OUTDATED - ONLY FOR TESTING!"
184+
)
185+
186+
XMLSource.fromFile(file, language)
187+
}
188+
else
189+
{
190+
val url = paths.apiUrl
191+
WikiSource.fromNamespaces(Set(namespace), url, language)
192+
}
193+
194+
source.map(page => (page.title, page)).toMap
176195
}
177196

178197
protected def loadOntology() : Ontology =

0 commit comments

Comments
 (0)