Skip to content

Commit dd7e452

Browse files
jcschaff and claude committed
Add scan-xml-control-chars admin CLI for biomodel/mathmodel CLOBs
Read-only scanner that walks vc_biomodelxml.bmxml and vc_mathmodelxml.mmxml and reports every row with codepoints rejected by XmlChars (C0 controls, unpaired surrogates, non-character codepoints, U+FFFD per project policy). Streams CLOBs through a Reader, caps in-memory size (--max-clob-mb), and uses an autoFlush PrintWriter so output is durable even if the long-running scan is interrupted. Output is TSV with kind/model_id/userid/offset/cp_hex/snippet columns so corrupted models can be triaged for repair. Motivated by the two failing biomodels (311226221, 311875206); knowing the full scope of corruption in the prod DB is a prerequisite for choosing between repair-in-place and mark-broken. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6d90ec5 commit dd7e452

3 files changed

Lines changed: 219 additions & 0 deletions

File tree

vcell-admin/src/main/java/org/vcell/admin/cli/AdminCLI.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import org.vcell.admin.cli.db.DatabaseCompareSchemaCommand;
1010
import org.vcell.admin.cli.db.DatabaseCreateScriptCommand;
1111
import org.vcell.admin.cli.db.DatabaseDestroyAndRecreateCommand;
12+
import org.vcell.admin.cli.db.XmlControlCharScanCommand;
1213
import org.vcell.admin.cli.mathverifier.ModeldbLoadTestCommand;
1314
import org.vcell.admin.cli.mathverifier.ModeldbMathGenTestCommand;
1415
import org.vcell.admin.cli.models.ModelCommands;
@@ -31,6 +32,7 @@
3132
DatabaseCompareSchemaCommand.class,
3233
DatabaseDestroyAndRecreateCommand.class,
3334
DatabaseCreateScriptCommand.class,
35+
XmlControlCharScanCommand.class,
3436
UsageCommand.class,
3537
UsersQueryCommand.class,
3638
ResultSetCrawlerCommand.class,

vcell-admin/src/main/java/org/vcell/admin/cli/CLIDatabaseService.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,4 +181,8 @@ public List<BioModelInfo> queryBiomodelsByOwner(User owner) throws DataAccessExc
181181
public List<MathModelInfo> queryMathmodelsByOwner(User owner) throws DataAccessException {
182182
return Arrays.asList(getDatabaseServer().getMathModelInfos(owner, false));
183183
}
184+
185+
public ConnectionFactory getConnectionFactory() {
186+
return conFactory;
187+
}
184188
}
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
package org.vcell.admin.cli.db;
2+
3+
import org.vcell.admin.cli.CLIDatabaseService;
4+
import org.vcell.db.ConnectionFactory;
5+
import org.vcell.util.xml.XmlChars;
6+
import picocli.CommandLine.Command;
7+
import picocli.CommandLine.Option;
8+
9+
import java.io.BufferedWriter;
10+
import java.io.OutputStreamWriter;
11+
import java.io.PrintWriter;
12+
import java.io.Reader;
13+
import java.io.Writer;
14+
import java.nio.charset.StandardCharsets;
15+
import java.nio.file.Files;
16+
import java.nio.file.Path;
17+
import java.sql.Clob;
18+
import java.sql.Connection;
19+
import java.sql.PreparedStatement;
20+
import java.sql.ResultSet;
21+
import java.util.concurrent.Callable;
22+
23+
@Command(name = "scan-xml-control-chars",
24+
description = "scan biomodel and mathmodel CLOBs for invalid XML chars (read-only)")
25+
public class XmlControlCharScanCommand implements Callable<Integer> {
26+
27+
@Option(names = "--output", description = "TSV output path (default: xml-bad-chars.tsv)")
28+
private Path output = Path.of("xml-bad-chars.tsv");
29+
30+
@Option(names = "--limit", description = "max rows to scan per kind (0 = no limit)")
31+
private int limit = 0;
32+
33+
@Option(names = "--skip-biomodels", description = "skip vc_biomodelxml")
34+
private boolean skipBiomodels = false;
35+
36+
@Option(names = "--skip-mathmodels", description = "skip vc_mathmodelxml")
37+
private boolean skipMathmodels = false;
38+
39+
@Option(names = "--all-occurrences",
40+
description = "report every bad codepoint (default: only the first per row)")
41+
private boolean allOccurrences = false;
42+
43+
@Option(names = "--progress-every", description = "print progress every N rows (default 500)")
44+
private int progressEvery = 500;
45+
46+
@Option(names = "--snippet-radius", description = "chars of context around the bad codepoint (default 20)")
47+
private int snippetRadius = 20;
48+
49+
@Option(names = "--max-clob-mb", description = "skip CLOBs larger than this many MB (default 64)")
50+
private int maxClobMB = 64;
51+
52+
private static final String BIOMODEL_QUERY =
53+
"SELECT b.id AS model_id, u.userid, x.bmxml AS xml " +
54+
"FROM vc_biomodelxml x " +
55+
"JOIN vc_biomodel b ON x.biomodelref = b.id " +
56+
"JOIN vc_userinfo u ON b.ownerref = u.id";
57+
58+
private static final String MATHMODEL_QUERY =
59+
"SELECT m.id AS model_id, u.userid, x.mmxml AS xml " +
60+
"FROM vc_mathmodelxml x " +
61+
"JOIN vc_mathmodel m ON x.mathmodelref = m.id " +
62+
"JOIN vc_userinfo u ON m.ownerref = u.id";
63+
64+
public Integer call() {
65+
long t0 = System.nanoTime();
66+
System.err.println("scan-xml-control-chars: starting (output=" + output.toAbsolutePath() + ")");
67+
try (CLIDatabaseService cliDb = new CLIDatabaseService();
68+
Writer fileWriter = Files.newBufferedWriter(output, StandardCharsets.UTF_8);
69+
PrintWriter out = new PrintWriter(new BufferedWriter(fileWriter), true /*autoFlush*/)) {
70+
ConnectionFactory cf = cliDb.getConnectionFactory();
71+
out.println("kind\tmodel_id\tuserid\toffset\tcp_hex\tsnippet");
72+
73+
int badRows = 0;
74+
int totalScanned = 0;
75+
if (!skipBiomodels) {
76+
int[] r = scan(cf, "biomodel", BIOMODEL_QUERY, out);
77+
totalScanned += r[0];
78+
badRows += r[1];
79+
}
80+
if (!skipMathmodels) {
81+
int[] r = scan(cf, "mathmodel", MATHMODEL_QUERY, out);
82+
totalScanned += r[0];
83+
badRows += r[1];
84+
}
85+
86+
long ms = (System.nanoTime() - t0) / 1_000_000L;
87+
System.err.println("scan-xml-control-chars: done. scanned=" + totalScanned
88+
+ " bad_rows=" + badRows + " elapsed_ms=" + ms);
89+
return 0;
90+
} catch (Exception e) {
91+
e.printStackTrace(System.err);
92+
return 1;
93+
}
94+
}
95+
96+
private int[] scan(ConnectionFactory cf, String kind, String sql, PrintWriter out) throws Exception {
97+
Object lock = new Object();
98+
Connection con = cf.getConnection(lock);
99+
int scanned = 0;
100+
int badRows = 0;
101+
try (PreparedStatement ps = con.prepareStatement(sql);
102+
ResultSet rs = executeStreaming(ps)) {
103+
while (rs.next()) {
104+
if (limit > 0 && scanned >= limit) break;
105+
scanned++;
106+
long modelId = rs.getLong("model_id");
107+
String userid = rs.getString("userid");
108+
Clob clob = rs.getClob("xml");
109+
if (clob == null) continue;
110+
int hits = scanClob(kind, modelId, userid, clob, out);
111+
if (hits > 0) badRows++;
112+
if (scanned % progressEvery == 0) {
113+
System.err.println("[" + kind + "] scanned=" + scanned + " bad_rows=" + badRows);
114+
}
115+
}
116+
} finally {
117+
cf.release(con, lock);
118+
}
119+
System.err.println("[" + kind + "] complete: scanned=" + scanned + " bad_rows=" + badRows);
120+
return new int[] { scanned, badRows };
121+
}
122+
123+
private static ResultSet executeStreaming(PreparedStatement ps) throws Exception {
124+
// Encourage the driver not to materialize the full result set in memory.
125+
try {
126+
ps.setFetchSize(50);
127+
} catch (Exception ignore) { /* not all drivers support */ }
128+
return ps.executeQuery();
129+
}
130+
131+
/**
132+
* Scan a single CLOB. Returns the number of bad codepoints reported.
133+
*/
134+
private int scanClob(String kind, long modelId, String userid, Clob clob, PrintWriter out)
135+
throws Exception {
136+
long lengthChars;
137+
try {
138+
lengthChars = clob.length();
139+
} catch (Exception e) {
140+
// some drivers/CLOBs don't support length(); fall back to streaming read with a size cap
141+
lengthChars = -1;
142+
}
143+
long maxChars = (long) maxClobMB * 1024L * 1024L / 2L; // chars are 2 bytes
144+
if (lengthChars > maxChars) {
145+
System.err.println("[" + kind + "/" + modelId + "/" + userid
146+
+ "] skipping: CLOB length " + lengthChars + " chars > cap");
147+
return 0;
148+
}
149+
StringBuilder doc = new StringBuilder(lengthChars > 0 ? (int) lengthChars : 16384);
150+
try (Reader r = clob.getCharacterStream()) {
151+
char[] buf = new char[8192];
152+
int n;
153+
while ((n = r.read(buf)) != -1) {
154+
doc.append(buf, 0, n);
155+
if (doc.length() > maxChars) {
156+
System.err.println("[" + kind + "/" + modelId + "/" + userid
157+
+ "] skipping: CLOB length > cap");
158+
return 0;
159+
}
160+
}
161+
}
162+
// doc is now a CharSequence; XmlChars.firstInvalidIndex walks codepoints with proper
163+
// surrogate handling. For all-occurrences we loop ourselves.
164+
int hits = 0;
165+
int from = 0;
166+
while (true) {
167+
int idx = firstInvalidIndexFrom(doc, from);
168+
if (idx < 0) break;
169+
int cp = Character.codePointAt(doc, idx);
170+
String snippet = renderSnippet(doc, idx, cp);
171+
out.printf("%s\t%d\t%s\t%d\t0x%04X\t%s%n",
172+
kind, modelId, userid, idx, cp, snippet);
173+
hits++;
174+
if (!allOccurrences) break;
175+
from = idx + Character.charCount(cp);
176+
}
177+
return hits;
178+
}
179+
180+
private static int firstInvalidIndexFrom(CharSequence s, int from) {
181+
int i = from;
182+
int len = s.length();
183+
while (i < len) {
184+
int cp = Character.codePointAt(s, i);
185+
if (!XmlChars.isValidXml10Char(cp)) return i;
186+
i += Character.charCount(cp);
187+
}
188+
return -1;
189+
}
190+
191+
private String renderSnippet(CharSequence doc, int idx, int cp) {
192+
int radius = Math.max(snippetRadius, 0);
193+
int from = Math.max(0, idx - radius);
194+
int charLen = Character.charCount(cp);
195+
int to = Math.min(doc.length(), idx + charLen + radius);
196+
StringBuilder sb = new StringBuilder();
197+
for (int i = from; i < idx; i++) appendDisplay(sb, doc.charAt(i));
198+
sb.append('[').append(String.format("U+%04X", cp)).append(']');
199+
for (int i = idx + charLen; i < to; i++) appendDisplay(sb, doc.charAt(i));
200+
// strip TSV-breaking whitespace
201+
return sb.toString().replace('\t', ' ').replace('\r', ' ').replace('\n', ' ');
202+
}
203+
204+
private static void appendDisplay(StringBuilder sb, char c) {
205+
if (c == 0xFFFD) {
206+
sb.append("<U+FFFD>");
207+
} else if (c < 0x20 && c != 0x09) {
208+
sb.append(String.format("\\x%02X", (int) c));
209+
} else {
210+
sb.append(c);
211+
}
212+
}
213+
}

0 commit comments

Comments
 (0)