Skip to content

Commit 0bf9d9d

Browse files
authored
Merge pull request #38 from mseri/fix-html-entitites
Avoid unescaped HTML entities from DOI API output
2 parents 33c9e2b + b583c5b commit 0bf9d9d

12 files changed

Lines changed: 118 additions & 30 deletions

File tree

bibfmt/bin/bibfmt.ml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ let bibfmt out strict single_line quiet verbose force files =
5555
"Warning: No valid BibTeX entries found in the file.\n%!";
5656
combined_content)
5757
else
58-
let options =
58+
let options =
5959
{ Bibtex.default_options with strict; single_line }
6060
in
6161
Bibtex.pretty_print_bibtex ~options parse_result.items)
@@ -114,7 +114,8 @@ let () =
114114
in
115115
let single_line =
116116
let doc =
117-
"Force field values onto a single line by replacing newlines with a space."
117+
"Force field values onto a single line by replacing newlines with a \
118+
space."
118119
in
119120
Arg.(value & flag & info [ "l"; "single-line" ] ~doc)
120121
in
@@ -141,7 +142,10 @@ let () =
141142
Arg.(value & pos_all string [] & info [] ~docv:"FILES" ~doc)
142143
in
143144
let bibfmt_t =
144-
Term.(ret (const bibfmt $ out $ strict $ single_line $ quiet $ verbose $ force $ files))
145+
Term.(
146+
ret
147+
(const bibfmt $ out $ strict $ single_line $ quiet $ verbose $ force
148+
$ files))
145149
in
146150
let info =
147151
let doc = "A little CLI tool to pretty print bibtex files." in

bibfmt/lib/bibtex.ml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -677,9 +677,8 @@ let format_entry options entry =
677677
|> List.filter (function
678678
| Field f -> (
679679
match f.value with
680-
| QuotedStringValue s
681-
| BracedStringValue s
682-
| UnquotedStringValue s ->
680+
| QuotedStringValue s | BracedStringValue s | UnquotedStringValue s
681+
->
683682
String.length (String.trim s) > 0
684683
| NumberValue _ -> true)
685684
| EntryComment _ -> true)
@@ -706,7 +705,9 @@ let format_entry options entry =
706705
let contents_str =
707706
if filtered_contents = [] then ""
708707
else
709-
let formatted_contents = List.map format_entry_content' filtered_contents in
708+
let formatted_contents =
709+
List.map format_entry_content' filtered_contents
710+
in
710711
let rec add_commas_except_last = function
711712
| [] -> []
712713
| [ last ] -> [ last ] (* No comma for the last item *)

bibfmt/lib/bibtex.mli

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,12 +150,14 @@ val format_field_value_with_url_unescaping :
150150
@return String representation with URLs unescaped if applicable *)
151151

152152
val format_field : bool -> bool -> field -> string
153-
(** [format_field capitalized single_line field] formats a complete field (name = value).
153+
(** [format_field capitalized single_line field] formats a complete field (name
154+
= value).
154155
@param field The field to format
155156
@return String representation of the field *)
156157

157158
val format_entry_content : bool -> bool -> entry_content -> string
158-
(** [format_entry_content capitalized single_line content] formats entry content (field or comment).
159+
(** [format_entry_content capitalized single_line content] formats entry content
160+
(field or comment).
159161
@param content The entry content to format
160162
@return String representation of the content *)
161163

@@ -201,13 +203,14 @@ val find_duplicate_groups :
201203
Each group contains at least 2 entries that match on the specified keys. *)
202204

203205
val string_of_field_value : field_value -> string
204-
(** [string_of_field_value fv] converts a field value to its string representation.
206+
(** [string_of_field_value fv] converts a field value to its string
207+
representation.
205208
@param fv The field value to convert
206209
@return String representation of the field value *)
207210

208211
val make_field : string -> string -> entry_content
209-
(** [make_field name value] creates a BibTeX field with the given name and value.
210-
The value is wrapped in braces.
212+
(** [make_field name value] creates a BibTeX field with the given name and
213+
value. The value is wrapped in braces.
211214
@param name Field name
212215
@param value Field value as a string
213216
@return An entry_content Field with a BracedStringValue *)

doi2bib.opam

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@ build: [
2323
dev-repo: "git+https://github.com/mseri/doi2bib.git"
2424
depends: [
2525
"dune" {>= "3.0"}
26-
"ocaml" {>= "4.08"}
26+
"ocaml" {>= "4.14.0"}
2727
"bibfmt" {= version}
28-
"astring" {>= "0.8.0"}
2928
"cohttp-lwt-unix" {>= "2.5.0"}
3029
"cmdliner" {>= "1.1.0"}
3130
"clz" {>= "0.1.0"}

doi2bib.opam.template

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
depends: [
2-
"dune" {>= "2.8"}
3-
"ocaml" {>= "4.08"}
2+
"dune" {>= "3.0"}
3+
"ocaml" {>= "4.14.0"}
44
"bibfmt" {= version}
5-
"astring" {>= "0.8.0"}
65
"cohttp-lwt-unix" {>= "2.5.0"}
76
"cmdliner" {>= "1.1.0"}
87
"clz" {>= "0.1.0"}

doi2bib/bin/doi2bib.ml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ let process_id outfile id =
66
let open Lwt.Syntax in
77
let* bibtex = Http.get_bib_entry @@ Parser.parse_id id in
88

9-
let parsed_items = Bibtex.parse_bibtex bibtex in
9+
let parsed_items =
10+
Bibtex.parse_bibtex bibtex |> List.map Helpers.clean_item
11+
in
1012
let formatted =
1113
if List.length parsed_items = 0 then (
1214
Printf.eprintf
@@ -88,7 +90,7 @@ let process_file outfile infile =
8890
let write_out () =
8991
let bibtex_out = Buffer.contents bibtex_buffer in
9092
let open Bibtex in
91-
let parsed_items = parse_bibtex bibtex_out in
93+
let parsed_items = parse_bibtex bibtex_out |> List.map Helpers.clean_item in
9294
let options = { default_options with strict = true } in
9395
let formatted = pretty_print_bibtex ~options parsed_items in
9496

doi2bib/lib/doi2bib.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
module Http = Http
22
module Parser = Parser
3+
module Helpers = Helpers

doi2bib/lib/dune

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
(library
22
(name doi2bib)
3-
(libraries astring cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix)
3+
(libraries cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix bibfmt)
44
(preprocess future_syntax)
55
(package doi2bib))

doi2bib/lib/helpers.ml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
(** [decode_html_entities s] replaces a small, fixed set of HTML entities in
    [s] with plain-text equivalents.

    The substitutions are applied one after another, in the order of the table
    below, each over the result of the previous one. Because [&amp;] comes
    first, a double-encoded entity such as [&amp;quot;] ends up fully decoded.
    Entities that are not in the table are left untouched.

    @param s Text that may contain HTML entities
    @return [s] with every listed entity replaced *)
let decode_html_entities s =
  let replacements =
    [
      ("&amp;", "&");
      ("&quot;", "\"");
      (* The apostrophe is accepted in both its named and numeric forms. *)
      ("&apos;", "'");
      ("&#39;", "'");
      ("&copy;", "(c)");
      ("&reg;", "(r)");
      ("&trade;", "(tm)");
      ("&nbsp;", " ");
    ]
  in
  List.fold_left
    (fun acc (pattern, replacement) ->
      (* [Re.str] matches the entity literally, so no regex escaping is
         needed for '&', '#' or ';'. *)
      let re = Re.compile (Re.str pattern) in
      Re.replace_string ~all:true re ~by:replacement acc)
    s replacements
18+
19+
(** [escape_ampersand s] prefixes every ['&'] in [s] with a backslash, unless
    the character immediately before it in [s] is already a backslash.

    The string is scanned once and the decision for each ['&'] looks only at
    the preceding input character, so runs of ampersands are handled too:
    ["a&&b"] becomes ["a\\&\\&b"]. A regex that has to consume the preceding
    character in order to inspect it would skip the second ['&'] of such a
    run.

    @param s Text whose bare ampersands must be escaped
    @return [s] with each unescaped ['&'] replaced by ["\\&"] *)
let escape_ampersand s =
  let buf = Buffer.create (String.length s + 8) in
  String.iteri
    (fun i c ->
      let already_escaped = i > 0 && Char.equal s.[i - 1] '\\' in
      if Char.equal c '&' && not already_escaped then
        Buffer.add_string buf "\\&"
      else Buffer.add_char buf c)
    s;
  Buffer.contents buf
35+
36+
(* HTML entities may show up almost anywhere in the field contents returned by
   the doi API, so they have to be replaced. For now only a few common ones
   are decoded, after which any '&' is escaped. Entries containing '#' or '%'
   can still fail (URLs are already taken care of); further action is
   deferred until that actually happens. *)
let clean_string s = escape_ampersand (decode_html_entities s)
42+
43+
(** [clean_field_value v] runs [clean_string] over the text carried by a
    quoted, braced or unquoted string value, keeping the same constructor.
    Number values are returned unchanged. *)
let clean_field_value value =
  let open Bibtex in
  match value with
  | QuotedStringValue text -> QuotedStringValue (clean_string text)
  | BracedStringValue text -> BracedStringValue (clean_string text)
  | UnquotedStringValue text -> UnquotedStringValue (clean_string text)
  | NumberValue _ as number -> number
50+
51+
(** [clean_item item] cleans the value of every field of a BibTeX entry with
    [clean_field_value]. Comments inside an entry, and top-level comments, are
    returned as they are. *)
let clean_item item =
  let open Bibtex in
  let clean_content = function
    | Field field -> Field { field with value = clean_field_value field.value }
    | EntryComment _ as comment -> comment
  in
  match item with
  | Entry entry ->
      Entry { entry with contents = List.map clean_content entry.contents }
  | Comment _ as comment -> comment

doi2bib/lib/parser.ml

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,19 @@ let string_of_id = function
88
| PubMed s -> "PubMed ID '" ^ s ^ "'"
99

1010
let parse_id id =
11-
let open Astring in
12-
let is_prefix affix s = String.is_prefix ~affix (String.Ascii.lowercase s) in
11+
let is_prefix affix s =
12+
let n = String.length affix in
13+
String.length s >= n && String.sub (String.lowercase_ascii s) 0 n = affix
14+
in
1315
let sub start s =
14-
String.sub ~start s |> String.Sub.to_string |> String.trim
16+
String.trim (String.sub s start (String.length s - start))
1517
in
16-
let contains c s = String.exists (fun c' -> c' = c) s in
1718
match id with
1819
| doi when is_prefix "doi:" doi -> DOI (sub 4 doi)
1920
| arxiv when is_prefix "arxiv:" arxiv -> ArXiv (sub 6 arxiv)
2021
| pubmed when is_prefix "pmc" pubmed -> PubMed pubmed
21-
| doi when contains '/' doi -> DOI (String.trim doi)
22-
| arxiv when contains '.' arxiv -> ArXiv (String.trim arxiv)
22+
| doi when String.contains doi '/' -> DOI (String.trim doi)
23+
| arxiv when String.contains arxiv '.' -> ArXiv (String.trim arxiv)
2324
| _ -> raise (Parse_error id)
2425

2526
let parse_atom id atom =
@@ -42,13 +43,17 @@ let parse_atom id atom =
4243
get_attr "term" a
4344
in
4445
let bibid =
45-
let open Astring in
46-
(match String.cuts ~empty:false ~sep:" " authors with
46+
(match
47+
String.split_on_char ' ' authors |> List.filter (fun s -> s <> "")
48+
with
4749
| _ :: s :: _ -> s
4850
| s :: _ -> s
4951
| [] -> "")
5052
^ year
51-
^ (String.cut ~sep:" " title |> Option.map fst |> Option.value ~default:"")
53+
^
54+
match String.index_opt title ' ' with
55+
| Some i -> String.sub title 0 i
56+
| None -> ""
5257
in
5358
Printf.sprintf
5459
{|@misc{%s,

0 commit comments

Comments
 (0)