From 15652bd905a6c9dda59a81b14d4766adbbae2ea8 Mon Sep 17 00:00:00 2001
From: Toddr Bot
Date: Fri, 8 May 2026 12:26:36 +0000
Subject: [PATCH] fix: validate UTF-8 continuation bytes in domParseChar to prevent OOB read

domParseChar() read continuation bytes for multi-byte UTF-8 sequences
without verifying they actually exist or are valid. A truncated sequence
like "a\xF0" caused reads past the NUL terminator into uninitialized
heap memory. The caller LibXML_test_node_name() then advanced its
pointer by the (wrong) reported length, continuing to read from
uncontrolled heap until hitting a zero byte or unmapped memory.

Add validation that each continuation byte has the 10xxxxxx form before
reading it, matching libxml2's own xmlCurrentChar() behavior. Invalid
sequences now return 0 with *len = -1.

Fixes #146

Co-Authored-By: Claude Opus 4.6
---
 dom.c          | 18 ++++++++++++++++++
 t/06elements.t | 10 +++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/dom.c b/dom.c
index 94518b0a..d86ed4a0 100644
--- a/dom.c
+++ b/dom.c
@@ -294,6 +294,13 @@ domParseChar( xmlChar *cur, int *len )
         if ((c & 0xe0) == 0xe0) {
             if ((c & 0xf0) == 0xf0) {
                 /* 4-byte code */
+                if ((cur[1] & 0xC0) != 0x80 ||
+                    (cur[2] & 0xC0) != 0x80 ||
+                    (cur[3] & 0xC0) != 0x80)
+                {
+                    *len = -1;
+                    return(0);
+                }
                 *len = 4;
                 val = (cur[0] & 0x7) << 18;
                 val |= (cur[1] & 0x3f) << 12;
@@ -301,6 +308,12 @@ domParseChar( xmlChar *cur, int *len )
                 val |= cur[3] & 0x3f;
             } else {
                 /* 3-byte code */
+                if ((cur[1] & 0xC0) != 0x80 ||
+                    (cur[2] & 0xC0) != 0x80)
+                {
+                    *len = -1;
+                    return(0);
+                }
                 *len = 3;
                 val = (cur[0] & 0xf) << 12;
                 val |= (cur[1] & 0x3f) << 6;
@@ -308,6 +321,11 @@ domParseChar( xmlChar *cur, int *len )
             }
         } else {
             /* 2-byte code */
+            if ((cur[1] & 0xC0) != 0x80)
+            {
+                *len = -1;
+                return(0);
+            }
             *len = 2;
             val = (cur[0] & 0x1f) << 6;
             val |= cur[1] & 0x3f;
diff --git a/t/06elements.t b/t/06elements.t
index 84fedbbe..b2e7faf0 100644
--- a/t/06elements.t
+++ b/t/06elements.t
@@ -7,7 +7,7 @@ use strict;
 use warnings;
 
 # Should be 187.
-use Test::More tests => 191;
+use Test::More tests => 200;
 
 use XML::LibXML;
 
@@ -21,8 +21,12 @@ my $attname2 = "B";
 my $attvalue2 = "b";
 my $attname3 = "C";
 
-# TEST:$badnames=4;
-my @badnames= ("1A", "<><", "&", "-:");
+# TEST:$badnames=7;
+my @badnames= ("1A", "<><", "&", "-:",
+    "a\xF0", # truncated 4-byte UTF-8
+    "a\xE0", # truncated 3-byte UTF-8
+    "a\xC0", # truncated 2-byte UTF-8
+);
 
 # 1. bound node
 {