Map the weird numeric escapes to textual ones

This commit is contained in:
2025-03-26 02:34:21 +01:00
parent 2bfd9f951e
commit 1a4b4f76f2
2 changed files with 141 additions and 21 deletions

View File

@@ -84,10 +84,11 @@ func (p *XMLProcessor) ProcessContent(content string, path string, luaExpr strin
declaration := doc.FirstChild.OutputXML(true) declaration := doc.FirstChild.OutputXML(true)
// Remove the firstChild (declaration) before serializing the rest of the document // Remove the firstChild (declaration) before serializing the rest of the document
doc.FirstChild = doc.FirstChild.NextSibling doc.FirstChild = doc.FirstChild.NextSibling
return declaration + doc.OutputXML(true), modCount, matchCount, nil return ConvertToNamedEntities(declaration + doc.OutputXML(true)), modCount, matchCount, nil
} }
return doc.OutputXML(true), modCount, matchCount, nil // Convert numeric entities to named entities for better readability
return ConvertToNamedEntities(doc.OutputXML(true)), modCount, matchCount, nil
} }
// ToLua converts XML node values to Lua variables // ToLua converts XML node values to Lua variables
@@ -268,3 +269,127 @@ func nodeTypeName(nodeType xmlquery.NodeType) string {
return "unknown" return "unknown"
} }
} }
// namedEntityReplacer rewrites decimal numeric character references
// (e.g. "&#169;") to their named equivalents (e.g. "&copy;") in a single
// pass. It is built once at package initialisation; a *strings.Replacer is
// safe for concurrent use.
//
// NOTE(review): the numeric spellings below were reconstructed as decimal
// references from the per-entry comments; confirm they match the form the
// XML serializer actually emits (hexadecimal forms such as "&#xA9;" are not
// matched).
var namedEntityReplacer = strings.NewReplacer(
	// Basic XML entities (the only five predefined by XML itself)
	"&#34;", "&quot;", // double quote
	"&#39;", "&apos;", // single quote
	"&#60;", "&lt;", // less than
	"&#62;", "&gt;", // greater than
	"&#38;", "&amp;", // ampersand

	// Common symbols
	"&#160;", "&nbsp;", // non-breaking space
	"&#169;", "&copy;", // copyright
	"&#174;", "&reg;", // registered trademark
	"&#8364;", "&euro;", // euro
	"&#163;", "&pound;", // pound
	"&#165;", "&yen;", // yen
	"&#162;", "&cent;", // cent
	"&#167;", "&sect;", // section
	"&#8482;", "&trade;", // trademark
	"&#9824;", "&spades;", // spade
	"&#9827;", "&clubs;", // club
	"&#9829;", "&hearts;", // heart
	"&#9830;", "&diams;", // diamond

	// Special characters
	"&#161;", "&iexcl;", // inverted exclamation
	"&#191;", "&iquest;", // inverted question
	"&#171;", "&laquo;", // left angle quotes
	"&#187;", "&raquo;", // right angle quotes
	"&#183;", "&middot;", // middle dot
	"&#8226;", "&bull;", // bullet
	"&#8230;", "&hellip;", // horizontal ellipsis
	"&#8242;", "&prime;", // prime
	"&#8243;", "&Prime;", // double prime
	"&#8254;", "&oline;", // overline
	"&#8260;", "&frasl;", // fraction slash

	// Math symbols
	"&#177;", "&plusmn;", // plus-minus
	"&#215;", "&times;", // multiplication
	"&#247;", "&divide;", // division
	"&#8734;", "&infin;", // infinity
	"&#8776;", "&asymp;", // almost equal
	"&#8800;", "&ne;", // not equal
	"&#8804;", "&le;", // less than or equal
	"&#8805;", "&ge;", // greater than or equal
	"&#8721;", "&sum;", // summation
	"&#8730;", "&radic;", // square root
	"&#8747;", "&int;", // integral

	// Accented characters
	"&#192;", "&Agrave;", // A grave
	"&#193;", "&Aacute;", // A acute
	"&#194;", "&Acirc;", // A circumflex
	"&#195;", "&Atilde;", // A tilde
	"&#196;", "&Auml;", // A umlaut
	"&#197;", "&Aring;", // A ring
	"&#198;", "&AElig;", // AE ligature
	"&#199;", "&Ccedil;", // C cedilla
	"&#200;", "&Egrave;", // E grave
	"&#201;", "&Eacute;", // E acute
	"&#202;", "&Ecirc;", // E circumflex
	"&#203;", "&Euml;", // E umlaut
	"&#204;", "&Igrave;", // I grave
	"&#205;", "&Iacute;", // I acute
	"&#206;", "&Icirc;", // I circumflex
	"&#207;", "&Iuml;", // I umlaut
	"&#208;", "&ETH;", // Eth
	"&#209;", "&Ntilde;", // N tilde
	"&#210;", "&Ograve;", // O grave
	"&#211;", "&Oacute;", // O acute
	"&#212;", "&Ocirc;", // O circumflex
	"&#213;", "&Otilde;", // O tilde
	"&#214;", "&Ouml;", // O umlaut
	"&#216;", "&Oslash;", // O slash
	"&#217;", "&Ugrave;", // U grave
	"&#218;", "&Uacute;", // U acute
	"&#219;", "&Ucirc;", // U circumflex
	"&#220;", "&Uuml;", // U umlaut
	"&#221;", "&Yacute;", // Y acute
	"&#222;", "&THORN;", // Thorn
	"&#223;", "&szlig;", // Sharp s
	"&#224;", "&agrave;", // a grave
	"&#225;", "&aacute;", // a acute
	"&#226;", "&acirc;", // a circumflex
	"&#227;", "&atilde;", // a tilde
	"&#228;", "&auml;", // a umlaut
	"&#229;", "&aring;", // a ring
	"&#230;", "&aelig;", // ae ligature
	"&#231;", "&ccedil;", // c cedilla
	"&#232;", "&egrave;", // e grave
	"&#233;", "&eacute;", // e acute
	"&#234;", "&ecirc;", // e circumflex
	"&#235;", "&euml;", // e umlaut
	"&#236;", "&igrave;", // i grave
	"&#237;", "&iacute;", // i acute
	"&#238;", "&icirc;", // i circumflex
	"&#239;", "&iuml;", // i umlaut
	"&#240;", "&eth;", // eth
	"&#241;", "&ntilde;", // n tilde
	"&#242;", "&ograve;", // o grave
	"&#243;", "&oacute;", // o acute
	"&#244;", "&ocirc;", // o circumflex
	"&#245;", "&otilde;", // o tilde
	"&#246;", "&ouml;", // o umlaut
	"&#248;", "&oslash;", // o slash
	"&#249;", "&ugrave;", // u grave
	"&#250;", "&uacute;", // u acute
	"&#251;", "&ucirc;", // u circumflex
	"&#252;", "&uuml;", // u umlaut
	"&#253;", "&yacute;", // y acute
	"&#254;", "&thorn;", // thorn
	"&#255;", "&yuml;", // y umlaut
)

// ConvertToNamedEntities replaces numeric XML entities with their named
// counterparts and returns the rewritten string.
//
// Only the decimal references listed in namedEntityReplacer are rewritten;
// any other text, including unknown numeric references, is returned
// unchanged. The replacement is done in one left-to-right pass, so the
// result does not depend on map iteration order and the lookup table is not
// rebuilt on every call.
//
// Caveat: XML predefines only &quot;, &apos;, &lt;, &gt; and &amp;. The
// remaining names (&nbsp;, &copy;, ...) are HTML entity names and are only
// valid in XML documents whose DTD declares them.
func ConvertToNamedEntities(xml string) string {
	return namedEntityReplacer.Replace(xml)
}

View File

@@ -17,12 +17,7 @@ func normalizeXMLWhitespace(s string) string {
s = re.ReplaceAllString(strings.TrimSpace(s), " ") s = re.ReplaceAllString(strings.TrimSpace(s), " ")
// Normalize XML entities for comparison // Normalize XML entities for comparison
s = strings.ReplaceAll(s, "'", "'") s = ConvertToNamedEntities(s)
s = strings.ReplaceAll(s, """, """)
s = strings.ReplaceAll(s, """, "\"")
s = strings.ReplaceAll(s, "&lt;", "<")
s = strings.ReplaceAll(s, "&gt;", ">")
s = strings.ReplaceAll(s, "&amp;", "&")
return s return s
} }
@@ -52,7 +47,7 @@ func TestXMLProcessor_Process_NodeValues(t *testing.T) {
<catalog> <catalog>
<book id="bk101"> <book id="bk101">
<author>Gambardella, Matthew</author> <author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title> <title>XML Developer&apos;s Guide</title>
<genre>Computer</genre> <genre>Computer</genre>
<price>89.9</price> <price>89.9</price>
<publish_date>2000-10-01</publish_date> <publish_date>2000-10-01</publish_date>
@@ -1015,9 +1010,9 @@ func TestXMLProcessor_Process_ElementReordering(t *testing.T) {
luaExpr := ` luaExpr := `
-- With table approach, we can reorder elements by redefining the table -- With table approach, we can reorder elements by redefining the table
-- Store the values -- Store the values
local artist = v.artist local artist = v.attr.artist
local title = v.title local title = v.attr.title
local year = v.year local year = v.attr.year
-- Clear the table -- Clear the table
for k in pairs(v) do for k in pairs(v) do
@@ -1025,9 +1020,9 @@ func TestXMLProcessor_Process_ElementReordering(t *testing.T) {
end end
-- Add elements in the desired order -- Add elements in the desired order
v.title = title v.attr.title = title
v.artist = artist v.attr.artist = artist
v.year = year v.attr.year = year
` `
result, modCount, matchCount, err := p.ProcessContent(content, "//song", luaExpr) result, modCount, matchCount, err := p.ProcessContent(content, "//song", luaExpr)
@@ -1178,13 +1173,13 @@ func TestXMLProcessor_Process_DynamicXPath(t *testing.T) {
expected := `<?xml version="1.0" encoding="UTF-8"?> expected := `<?xml version="1.0" encoding="UTF-8"?>
<configuration> <configuration>
<settings> <settings>
<setting name="timeout" value="60" /> <setting name="timeout" value="60"></setting>
<setting name="retries" value="3" /> <setting name="retries" value="3"></setting>
<setting name="backoff" value="exponential" /> <setting name="backoff" value="exponential"></setting>
</settings> </settings>
<advanced> <advanced>
<setting name="logging" value="debug" /> <setting name="logging" value="debug"></setting>
<setting name="timeout" value="120" /> <setting name="timeout" value="120"></setting>
</advanced> </advanced>
</configuration>` </configuration>`
@@ -1192,7 +1187,7 @@ func TestXMLProcessor_Process_DynamicXPath(t *testing.T) {
p := &XMLProcessor{} p := &XMLProcessor{}
// Double all timeout values in the configuration // Double all timeout values in the configuration
result, modCount, matchCount, err := p.ProcessContent(content, "//setting[@name='timeout']/@value", "v = v * 2") result, modCount, matchCount, err := p.ProcessContent(content, "//setting[@name='timeout']/@value", "v.value = v.value * 2")
if err != nil { if err != nil {
t.Fatalf("Error processing content: %v", err) t.Fatalf("Error processing content: %v", err)