Skip to content

Commit ca8dde6

Browse files
[MOD] HTML Parsing, no parser: return contents as text
1 parent 43b124e commit ca8dde6

6 files changed

Lines changed: 28 additions & 16 deletions

File tree

basex-core/src/main/java/org/basex/build/html/HtmlParser.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
import static org.basex.build.html.HtmlOptions.NOCDATA;
55
import static org.basex.query.QueryError.*;
66
import static org.basex.util.Token.*;
7+
import static org.basex.util.XMLToken.*;
78

89
import java.io.*;
910

1011
import org.basex.build.xml.*;
1112
import org.basex.core.*;
1213
import org.basex.io.*;
14+
import org.basex.io.serial.*;
1315
import org.basex.query.*;
1416
import org.basex.query.value.item.*;
1517
import org.basex.util.*;
@@ -70,8 +72,14 @@ public HtmlParser(final IO source, final Parser parser, final MainOptions option
7072
*/
7173
private static IO toXml(final IO io, final Parser parser, final HtmlOptions hopts)
7274
throws IOException {
73-
// parser unavailable: fall back to XML
74-
if(parser == null) return io;
75+
76+
// parser unavailable: wrap contents as text
77+
if(parser == null) {
78+
final TokenBuilder xml = new TokenBuilder().add(ELEM_O).add(HTML).add(ELEM_C).add(
79+
Serializer.value(io.read(), false, true, false)).add(ELEM_OS).add(HTML).add(ELEM_C);
80+
return new IOContent(xml.finish(), io.name());
81+
}
82+
7583
try {
7684
// define output
7785
final StringWriter sw = new StringWriter();
@@ -273,10 +281,11 @@ public boolean available(final HtmlOptions options) {
273281
};
274282

275283
/** The default parser: TAGSOUP if available, otherwise NU if available, otherwise {@code null}. */
276-
public static final Parser DEFAULT;
284+
public static final Parser PARSER;
285+
277286
static {
278287
final HtmlOptions opts = new HtmlOptions();
279-
DEFAULT = TAGSOUP.available(opts) ? TAGSOUP : NU.available(opts) ? NU : null;
288+
PARSER = TAGSOUP.available(opts) ? TAGSOUP : NU.available(opts) ? NU : null;
280289
}
281290

282291
/** String representation. */
@@ -343,7 +352,7 @@ static void ensureAvailable(final String className, final QNm name,
343352
* @return parser (can be {@code null})
344353
*/
345354
public static Parser of(final HtmlOptions options) {
346-
return of(options, Parser.DEFAULT);
355+
return of(options, Parser.PARSER);
347356
}
348357

349358
/**

basex-core/src/main/java/org/basex/io/serial/AdaptiveSerializer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ protected void atomic(final Item item) throws IOException {
126126
printChars(basex ? item.string(null) : ((QNm) item).unique());
127127
} else {
128128
final boolean simple = type == BOOLEAN || type.instanceOf(DECIMAL);
129-
final byte[] value = simple ? Token.token(item) : value(item.string(null), '"', false);
129+
final byte[] value = simple ? Token.token(item) :
130+
value(item.string(null), true, false, false);
130131
if(basex || simple || type.instanceOf(STRING) || type.oneOf(UNTYPED_ATOMIC, ANY_URI)) {
131132
printChars(value);
132133
} else {

basex-core/src/main/java/org/basex/io/serial/Serializer.java

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -656,14 +656,15 @@ private void prepare() throws IOException {
656656
/**
657657
* Serializes the specified value.
658658
* @param value value
659+
* @param quote enclose the value in double quotes and double any embedded double quote
660+
* @param xml additionally escape {@code <} characters, so the result can be used as XML text
659661
* @param chop chop large tokens
660-
* @param quote character for quoting the value; ignored if {@code 0}
661662
* @return value
662663
*/
663-
public static byte[] value(final byte[] value, final char quote, final boolean chop) {
664-
final boolean quoting = quote != 0;
664+
public static byte[] value(final byte[] value, final boolean quote, final boolean xml,
665+
final boolean chop) {
665666
final TokenBuilder tb = new TokenBuilder();
666-
if(quoting) tb.add(quote);
667+
if(quote) tb.add('"');
667668

668669
int c = 0;
669670
for(final TokenParser tp = new TokenParser(value); tp.more(); c++) {
@@ -675,11 +676,12 @@ public static byte[] value(final byte[] value, final char quote, final boolean c
675676
if(cp == '&') tb.add(E_AMP);
676677
else if(cp == '\r') tb.add(E_CR);
677678
else if(cp == '\n') tb.add(E_NL);
678-
else if(cp == quote) tb.add(quote).add(quote);
679+
else if(xml && cp == '<') tb.add(E_LT);
680+
else if(quote && cp == '"') tb.add('"').add('"');
679681
else tb.add(cp);
680682
}
681683

682-
if(quoting) tb.add(quote);
684+
if(quote) tb.add('"');
683685
return tb.finish();
684686
}
685687
}

basex-core/src/main/java/org/basex/query/QueryString.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ public String toString() {
237237
* @return string
238238
*/
239239
public static byte[] toValue(final byte[] value) {
240-
return Serializer.value(value, (char) 0, true);
240+
return Serializer.value(value, false, false, true);
241241
}
242242

243243
/**
@@ -246,6 +246,6 @@ public static byte[] toValue(final byte[] value) {
246246
* @return token
247247
*/
248248
public static byte[] toQuoted(final byte[] value) {
249-
return Serializer.value(value, '"', true);
249+
return Serializer.value(value, true, false, true);
250250
}
251251
}

basex-core/src/main/java/org/basex/query/func/html/HtmlParse.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ public Value value(final QueryContext qc) throws QueryException {
2323

2424
@Override
2525
protected final Parser parser() {
26-
return Parser.DEFAULT;
26+
return Parser.PARSER;
2727
}
2828

2929
/**

basex-core/src/main/java/org/basex/query/func/html/HtmlParser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
public final class HtmlParser extends StandardFunc {
1616
@Override
1717
public Item item(final QueryContext qc, final InputInfo ii) {
18-
final Parser parser = Parser.DEFAULT;
18+
final Parser parser = Parser.PARSER;
1919
return Str.get(parser != null ? parser.toString() : "");
2020
}
2121
}

0 commit comments

Comments
 (0)