|
1 | 1 | (ns yamlstar.parser.receiver |
2 | 2 | (:require [clojure.string :as str] |
3 | | - [yamlstar.parser.prelude :refer :all])) |
| 3 | + [yamlstar.parser.prelude :refer :all] |
| 4 | + [yamlstar.parser.parser :as parser])) |
4 | 5 |
|
5 | 6 | ;; Forward declarations |
6 | 7 | (declare push-event check-document-start check-document-end) |
7 | 8 |
|
| 9 | +;; Helper: parse hex-val as a base-16 integer (bit size 32) and return the string form of that code point via go/rune. The second value returned by strconv.ParseInt (the error) is bound to _ and discarded, so a failed parse is not reported to the caller. |
| 10 | +(defn hex->char [hex-val] |
| 11 | + (let [[n _] (strconv.ParseInt hex-val 16 32)] |
| 12 | + (str (go/rune n)))) |
| 13 | + |
8 | 14 | ;; Event constructors. stream-start-event takes no arguments and returns a map whose :event key is the string "stream_start". |
9 | 15 | (defn stream-start-event [] |
10 | 16 | {:event "stream_start"}) |
|
190 | 196 | ;; hex escapes |
191 | 197 | (re-matches (re-pattern (str "\\\\x(" hex "{2})")) match) |
192 | 198 | (let [[_ hex-val] (re-matches (re-pattern (str "\\\\x(" hex "{2})")) match)] |
193 | | - (str (char (Integer/parseInt hex-val 16)))) |
| 199 | + (hex->char hex-val)) |
194 | 200 |
|
195 | 201 | (re-matches (re-pattern (str "\\\\u(" hex "{4})")) match) |
196 | 202 | (let [[_ hex-val] (re-matches (re-pattern (str "\\\\u(" hex "{4})")) match)] |
197 | | - (str (char (Integer/parseInt hex-val 16)))) |
| 203 | + (hex->char hex-val)) |
198 | 204 |
|
199 | 205 | (re-matches (re-pattern (str "\\\\U(" hex "{8})")) match) |
200 | 206 | (let [[_ hex-val] (re-matches (re-pattern (str "\\\\U(" hex "{8})")) match)] |
201 | | - (String. (Character/toChars (Integer/parseInt hex-val 16)))) |
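| 207 | + ;; go/rune covers code points above U+FFFF. NOTE(review): this repeats the body of hex->char, and 8-digit values above 7FFFFFFF exceed ParseInt's 32-bit range while the error value is discarded here; confirm out-of-range escapes are rejected elsewhere. |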
| 208 | + (let [[n _] (strconv.ParseInt hex-val 16 32)] |
| 209 | + (str (go/rune n)))) |
202 | 210 |
|
203 | 211 | ;; line continuation |
204 | 212 | (re-matches #"(?:\\ ?\r?\n[ \t]*)" match) |
|
440 | 448 | lines (map #(str (:text %) "\n") lines) |
441 | 449 | text (apply str lines) |
442 | 450 | ;; :parser is stored directly (not as atom) in the receiver passed to callbacks |
443 | | - parser (:parser receiver) |
444 | | - state-curr @(requiring-resolve 'yamlstar.parser.parser/state-curr) |
445 | | - t (:t (state-curr parser)) |
| 451 | + p (:parser receiver) |
| 452 | + t (:t (parser/state-curr p)) |
446 | 453 | text (cond |
447 | 454 | (= t "clip") (str/replace text #"\n+$" "\n") |
448 | 455 | (= t "strip") (str/replace text #"\n+$" "") |
|
485 | 492 | (reset! (:in-scalar receiver) false) |
486 | 493 | (let [lines (map :text (cache-drop receiver)) |
487 | 494 | text (str/join "\n" lines) |
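| 495 | + ;; RE2 doesn't support lookaheads; capture and reinsert next char. NOTE(review): unlike a lookahead, the captured char is consumed and cannot begin the next match, so with three or more consecutive non-indented lines only every other break is folded ("a\nb\nc" -> "a b\nc", previously "a b c"); confirm and loop until stable if needed. |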
488 | 496 | text (-> text |
489 | | - (str/replace #"(?m)^(\S.*)\n(?=\S)" "$1 ") |
| 497 | + (str/replace #"(?m)^(\S.*)\n(\S)" "$1 $2") |
490 | 498 | (str/replace #"(?m)^(\S.*)\n(\n+)" "$1$2") |
491 | | - (str/replace #"(?m)^([ \t]+\S.*)\n(\n+)(?=\S)" "$1$2")) |
| 499 | + (str/replace #"(?m)^([ \t]+\S.*)\n(\n+)(\S)" "$1$2$3")) |
492 | 500 | text (str text "\n") |
493 | 501 | ;; :parser is stored directly (not as atom) in the receiver passed to callbacks |
494 | | - parser (:parser receiver) |
495 | | - state-curr @(requiring-resolve 'yamlstar.parser.parser/state-curr) |
496 | | - t (:t (state-curr parser)) |
| 502 | + p (:parser receiver) |
| 503 | + t (:t (parser/state-curr p)) |
497 | 504 | text (cond |
498 | 505 | (= t "clip") (let [t (str/replace text #"\n+$" "\n")] |
499 | 506 | (if (= t "\n") "" t)) |
|
557 | 564 | ;; URL-decode percent escapes |
558 | 565 | resolved-tag (str/replace resolved-tag #"%([0-9a-fA-F]{2})" |
559 | 566 | (fn [[_ hex]] |
560 | | - (str (char (Integer/parseInt hex 16)))))] |
| 567 | + (hex->char hex)))] |
561 | 568 | (reset! (:tag receiver) resolved-tag))) |
562 | 569 |
|
563 | 570 | ;; Alias node |
|
0 commit comments