Skip to content

Commit 1a7dcfd

Browse files
authored
Merge pull request #7838 from plotly/cam/avoid-decoding-unsafe-characters-tosvg
fix: Preserve XML structural entities during decode
2 parents c8cccb0 + 67cd1de commit 1a7dcfd

3 files changed

Lines changed: 512 additions & 398 deletions

File tree

draftlogs/7838_fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Preserve XML structural entities during decode when exporting SVG [[#7838](https://github.com/plotly/plotly.js/pull/7838)]

src/snapshot/tosvg.js

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,39 @@ var Color = require('../components/color');
99
var xmlnsNamespaces = require('../constants/xmlns_namespaces');
1010
var DOUBLEQUOTE_REGEX = /"/g;
1111
var DUMMY_SUB = 'TOBESTRIPPED';
12-
var DUMMY_REGEX = new RegExp('("' + DUMMY_SUB + ')|(' + DUMMY_SUB + '")', 'g');
13-
12+
// Match TOBESTRIPPED adjacent to either a literal " or its entity form &quot;.
13+
// XMLSerializer escapes inner double-quotes to &quot; inside "-delimited
14+
// attributes, and htmlEntityDecode now preserves that entity for safety.
15+
const DUMMY_REGEX = new RegExp(`("${DUMMY_SUB})|(${DUMMY_SUB}")|(&quot;${DUMMY_SUB})|(${DUMMY_SUB}&quot;)`, 'g');
16+
17+
// Entities for & " ' - decoding these in attribute context is an XSS vector,
18+
// so preserve them as-is. List includes named, decimal, and hex numeric forms.
19+
const PRESERVED_ENTITIES = ['&amp;', '&#38;', '&#x26;', '&quot;', '&#34;', '&#x22;', '&apos;', '&#39;', '&#x27;'];
20+
// Entities for < and > - normalize to numeric so downstream passes treat them
21+
// uniformly regardless of which form the serializer emitted.
22+
const LESS_THAN_ENTITIES = ['&lt;', '&#60;', '&#x3c;'];
23+
const GREATER_THAN_ENTITIES = ['&gt;', '&#62;', '&#x3e;'];
24+
25+
/**
26+
* Decode non-structural entities to Unicode for non-browser SVG renderers,
27+
* keeping & " ' < > entity-encoded to prevent attribute-context escape (XSS).
28+
*
29+
* @param s - serialized SVG string
30+
* @returns entity-normalized SVG string
31+
*/
1432
function htmlEntityDecode(s) {
15-
var hiddenDiv = d3.select('body').append('div').style({display: 'none'}).html('');
16-
var replaced = s.replace(/(&[^;]*;)/gi, function(d) {
17-
if(d === '&lt;') { return '&#60;'; } // special handling for brackets
18-
if(d === '&rt;') { return '&#62;'; }
19-
if(d.indexOf('<') !== -1 || d.indexOf('>') !== -1) { return ''; }
33+
const hiddenDiv = d3.select('body').append('div').style({ display: 'none' }).html('');
34+
const replaced = s.replace(/(&[^;]*;)/gi, (d) => {
35+
const lower = d.toLowerCase();
36+
if (PRESERVED_ENTITIES.includes(lower)) return d;
37+
if (LESS_THAN_ENTITIES.includes(lower)) return '&#60;';
38+
if (GREATER_THAN_ENTITIES.includes(lower)) return '&#62;';
39+
if (d.includes('<') || d.includes('>')) return '';
40+
2041
return hiddenDiv.html(d).text(); // everything else, let the browser decode it to unicode
2142
});
2243
hiddenDiv.remove();
44+
2345
return replaced;
2446
}
2547

@@ -48,29 +70,29 @@ module.exports = function toSVG(gd, format, scale) {
4870
// which notably add the contents of the gl-container
4971
// into the main svg node
5072
var basePlotModules = fullLayout._basePlotModules || [];
51-
for(i = 0; i < basePlotModules.length; i++) {
73+
for (i = 0; i < basePlotModules.length; i++) {
5274
var _module = basePlotModules[i];
5375

54-
if(_module.toSVG) _module.toSVG(gd);
76+
if (_module.toSVG) _module.toSVG(gd);
5577
}
5678

5779
// add top items above them assumes everything in toppaper is either
5880
// a group or a defs, and if it's empty (like hoverlayer) we can ignore it.
59-
if(toppaper) {
81+
if (toppaper) {
6082
var nodes = toppaper.node().childNodes;
6183

6284
// make copy of nodes as childNodes prop gets mutated in loop below
6385
var topGroups = Array.prototype.slice.call(nodes);
6486

65-
for(i = 0; i < topGroups.length; i++) {
87+
for (i = 0; i < topGroups.length; i++) {
6688
var topGroup = topGroups[i];
6789

68-
if(topGroup.childNodes.length) svg.node().appendChild(topGroup);
90+
if (topGroup.childNodes.length) svg.node().appendChild(topGroup);
6991
}
7092
}
7193

7294
// remove draglayer for Adobe Illustrator compatibility
73-
if(fullLayout._draggers) {
95+
if (fullLayout._draggers) {
7496
fullLayout._draggers.remove();
7597
}
7698

@@ -80,81 +102,82 @@ module.exports = function toSVG(gd, format, scale) {
80102
svg.node().style.background = '';
81103

82104
svg.selectAll('text')
83-
.attr({'data-unformatted': null, 'data-math': null})
84-
.each(function() {
105+
.attr({ 'data-unformatted': null, 'data-math': null })
106+
.each(function () {
85107
var txt = d3.select(this);
86108

87109
// hidden text is pre-formatting mathjax, the browser ignores it
88110
// but in a static plot it's useless and it can confuse batik
89111
// we've tried to standardize on display:none but make sure we still
90112
// catch visibility:hidden if it ever arises
91-
if(this.style.visibility === 'hidden' || this.style.display === 'none') {
113+
if (this.style.visibility === 'hidden' || this.style.display === 'none') {
92114
txt.remove();
93115
return;
94116
} else {
95117
// clear other visibility/display values to default
96118
// to not potentially confuse non-browser SVG implementations
97-
txt.style({visibility: null, display: null});
119+
txt.style({ visibility: null, display: null });
98120
}
99121

100122
// Font family styles break things because of quotation marks,
101123
// so we must remove them *after* the SVG DOM has been serialized
102124
// to a string (browsers convert singles back)
103125
var ff = this.style.fontFamily;
104-
if(ff && ff.indexOf('"') !== -1) {
126+
if (ff && ff.indexOf('"') !== -1) {
105127
txt.style('font-family', ff.replace(DOUBLEQUOTE_REGEX, DUMMY_SUB));
106128
}
107129

108130
// Drop normal font-weight, font-style and font-variant to reduce the size
109131
var fw = this.style.fontWeight;
110-
if(fw && (fw === 'normal' || fw === '400')) { // font-weight 400 is similar to normal
132+
if (fw && (fw === 'normal' || fw === '400')) {
133+
// font-weight 400 is similar to normal
111134
txt.style('font-weight', undefined);
112135
}
113136
var fs = this.style.fontStyle;
114-
if(fs && fs === 'normal') {
137+
if (fs && fs === 'normal') {
115138
txt.style('font-style', undefined);
116139
}
117140
var fv = this.style.fontVariant;
118-
if(fv && fv === 'normal') {
141+
if (fv && fv === 'normal') {
119142
txt.style('font-variant', undefined);
120143
}
121144
});
122145

123-
svg.selectAll('.gradient_filled,.pattern_filled').each(function() {
146+
svg.selectAll('.gradient_filled,.pattern_filled').each(function () {
124147
var pt = d3.select(this);
125148

126149
// similar to font family styles above,
127150
// we must remove " after the SVG DOM has been serialized
128151
var fill = this.style.fill;
129-
if(fill && fill.indexOf('url(') !== -1) {
152+
if (fill && fill.indexOf('url(') !== -1) {
130153
pt.style('fill', fill.replace(DOUBLEQUOTE_REGEX, DUMMY_SUB));
131154
}
132155

133156
var stroke = this.style.stroke;
134-
if(stroke && stroke.indexOf('url(') !== -1) {
157+
if (stroke && stroke.indexOf('url(') !== -1) {
135158
pt.style('stroke', stroke.replace(DOUBLEQUOTE_REGEX, DUMMY_SUB));
136159
}
137160
});
138161

139-
if(format === 'pdf' || format === 'eps') {
162+
if (format === 'pdf' || format === 'eps') {
140163
// these formats make the extra line MathJax adds around symbols look super thick in some cases
141164
// it looks better if this is removed entirely.
142-
svg.selectAll('#MathJax_SVG_glyphs path')
143-
.attr('stroke-width', 0);
165+
svg.selectAll('#MathJax_SVG_glyphs path').attr('stroke-width', 0);
144166
}
145167

146-
if(format === 'svg' && scale) {
168+
if (format === 'svg' && scale) {
147169
svg.attr('width', scale * width);
148170
svg.attr('height', scale * height);
149171
svg.attr('viewBox', '0 0 ' + width + ' ' + height);
150172
}
151173

152174
var s = new window.XMLSerializer().serializeToString(svg.node());
175+
// Decode numeric refs to Unicode so non-browser renderers (Batik, Illustrator) render them correctly.
153176
s = htmlEntityDecode(s);
154177
s = xmlEntityEncode(s);
155178

156179
// Fix quotations around font strings and gradient URLs
157-
s = s.replace(DUMMY_REGEX, '\'');
180+
s = s.replace(DUMMY_REGEX, "'");
158181

159182
return s;
160183
};

0 commit comments

Comments
 (0)