Skip to content

Commit f70d90d

Browse files
committed
fix(xmp): Correctly parse XMP with self-closing elements (#5106)
decode_xmp() only searched for `</rdf:Description>` end tags, so self-closing `<rdf:Description ... />` elements (valid XML) were silently skipped. Now detect both `/>` and `</rdf:Description>` endings, plus simplify quite a bit generally, by rewriting decode_xmp to use pugixml more thoroughly to do the whole parse job instead of trying to hunt for tags ourselves. Fixes #5021 Assisted-by: Claude Code / Opus 4.6 --------- Signed-off-by: Larry Gritz <lg@larrygritz.com>
1 parent 1d2f1d5 commit f70d90d

1 file changed

Lines changed: 26 additions & 45 deletions

File tree

src/libOpenImageIO/xmp.cpp

Lines changed: 26 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -408,25 +408,6 @@ add_attrib(ImageSpec& spec, string_view xmlname, string_view xmlvalue,
408408

409409

410410

411-
// Utility: Search str for the first substring in str (starting from
412-
// position pos) that starts with startmarker and ends with endmarker.
413-
// If not found, return false. If found, return true, store the
414-
// beginning and ending indices in startpos and endpos.
415-
static bool
416-
extract_middle(string_view str, size_t pos, string_view startmarker,
417-
string_view endmarker, size_t& startpos, size_t& endpos)
418-
{
419-
startpos = str.find(startmarker, pos);
420-
if (startpos == std::string::npos)
421-
return false; // start marker not found
422-
endpos = str.find(endmarker, startpos);
423-
if (endpos == std::string::npos)
424-
return false; // end marker not found
425-
endpos += endmarker.size();
426-
return true;
427-
}
428-
429-
430411
// Decode one XMP node and its children.
431412
// Return value is the size of the resulting attribute (can be used to
432413
// catch runaway or corrupt XML).
@@ -536,35 +517,35 @@ decode_xmp(string_view xml, ImageSpec& spec)
536517
#endif
537518
if (!xml.length())
538519
return true;
539-
for (size_t startpos = 0, endpos = 0;
540-
extract_middle(xml, endpos, "<rdf:Description", "</rdf:Description>",
541-
startpos, endpos);) {
542-
// Turn that middle section into an XML document
543-
string_view rdf = xml.substr(startpos, endpos - startpos); // scooch in
544-
#if DEBUG_XMP_READ
545-
std::cerr << "RDF is:\n---\n" << rdf.substr(0, 4096) << "\n---\n";
546-
#endif
547-
pugi::xml_document doc;
548-
pugi::xml_parse_result parse_result
549-
= doc.load_buffer(rdf.data(), rdf.size(),
550-
pugi::parse_default | pugi::parse_fragment);
551-
if (!parse_result) {
520+
// Some callers (e.g. JPEG APP1 markers) prepend a namespace URI before the
521+
// actual XML. Skip any leading non-XML content.
522+
auto xmlstart = xml.find('<');
523+
if (xmlstart == string_view::npos)
524+
return true;
525+
xml = xml.substr(xmlstart);
526+
pugi::xml_document doc;
527+
pugi::xml_parse_result parse_result
528+
= doc.load_buffer(xml.data(), xml.size(),
529+
pugi::parse_default | pugi::parse_fragment);
530+
if (!parse_result) {
552531
#if DEBUG_XMP_READ
553-
std::cerr << "Error parsing XML @" << parse_result.offset << ": "
554-
<< parse_result.description() << "\n";
532+
std::cerr << "Error parsing XML @" << parse_result.offset << ": "
533+
<< parse_result.description() << "\n";
555534
#endif
556-
// Instead of returning early here if there were errors parsing
557-
// the XML -- I have noticed that very minor XML malformations
558-
// are common in XMP found in files -- hope for the best and
559-
// go ahead and assume that maybe it managed to put something
560-
// useful in the resulting document.
561-
#if 0
562-
return true;
563-
#endif
564-
}
565-
// Decode the contents of the XML document (it will recurse)
566-
decode_xmp_node(doc.first_child(), spec);
535+
// Instead of returning early here if there were errors parsing
536+
// the XML -- I have noticed that very minor XML malformations
537+
// are common in XMP found in files -- hope for the best and
538+
// go ahead and assume that maybe it managed to put something
539+
// useful in the resulting document.
567540
}
541+
// Find the first rdf:Description node anywhere in the document.
542+
// decode_xmp_node() iterates siblings, so passing the first one processes
543+
// all rdf:Description siblings (i.e. all blocks in the rdf:RDF container).
544+
auto first_desc = doc.find_node([](pugi::xml_node n) {
545+
return strcmp(n.name(), "rdf:Description") == 0;
546+
});
547+
if (first_desc)
548+
decode_xmp_node(first_desc, spec);
568549
#if DEBUG_XMP_READ
569550
std::cerr << "XMP total parse time " << timer() << "\n";
570551
#endif

0 commit comments

Comments
 (0)