1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
private static Regex _tags = new Regex("<[^>]*(>|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled); private static Regex _whitelist = new Regex(@" ^</?(a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$ |^<(b|h)r\s?/?>$ |^<a[^>]+>$ |^<img[^>]+/?>$", /// <summary> /// sanitize any potentially dangerous tags from the provided raw HTML input using /// a whitelist based approach, leaving the "safe" HTML tags /// </summary> public static string Sanitize(string html) { var tagname = ""; Match tag; var tags = _tags.Matches(html); // iterate through all HTML tags in the input for (int i = tags.Count-1; i > -1; i--) { tag = tags[i]; tagname = tag.Value.ToLower(); if (!_whitelist.IsMatch(tagname)) { // not on our whitelist? I SAY GOOD DAY TO YOU, SIR. GOOD DAY! html = html.Remove(tag.Index, tag.Length); } else if (tagname.StartsWith("<a")) { // detailed <a> tag checking if (!IsMatch(tagname, @"<a\s href=""(https?|ftp)://[^""]+"" (\stitle=""[^""]+"")?\s?>")) { html = html.Remove(tag.Index, tag.Length); } } else if (tagname.StartsWith("<img")) { // detailed <img> tag checking if (!IsMatch(tagname, @"<img\s src=""https?://[^""]+"" (\swidth=""\d{1,3}"")? (\sheight=""\d{1,3}"")? (\salt=""[^""]*"")? (\stitle=""[^""]*"")? \s?/?>")) { html = html.Remove(tag.Index, tag.Length); } } } return html; } /// <summary> /// Utility function to match a regex pattern: case, whitespace, and line insensitive /// </summary> private static bool IsMatch(string s, string pattern) { return Regex.IsMatch(s, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture); }
Refactorings
No refactoring yet !
Konrad
June 20, 2008, June 20, 2008 11:35, permalink
“Looking at line 15 on it's own, I can't tell what type "tags" is. A List<Match>? a Match[]? a MatchCollection? IEnumerable<IMatch>? etc.”
– You're not really supposed to care about those implementation details. Otherwise interfaces would be Evil™. All you need to know about 'tags' is that it hosts matches and supports enumeration.
Anyway, line 4 contains an error. RegEx and HTML just don't mix well: what if users nest "<pre>" tags? Of course, they're not supposed to …
Another thing, placeholders are always problematic. What if someone used "%pre%1" in their codes? And of course, you still need to sanitize the contents of "<pre>" tags.
Rob Allen
June 20, 2008, June 20, 2008 13:23, permalink
Here is a blacklist to test your A and IMG tags against which includes HTML comments and all of the JavaScript event handlers. At first blush, I would think splitting the m string on the "=" when a blacklist item is detected and stepping through to scrub might be the way to go. I'll fill in more when I get some time.
1 2 3 4 5 6 7 8 9 10 11 12 13
var blacklist = //JavaScrip events which are valid on IMG and A tags @"<!--|-->|onabort|onblur|onchange|onclick|ondblclick|onerror|onfocus|onkeydown|onkeypress|onkeyup|onload| onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onresize|onselect|onsubmit|onunload"; if (!Regex.IsMatch(m.ToString(), blacklist, RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace)) { String[] mSplit = m.split("="); foreach (string str in mSplit) { } }
Nick Berardi
June 20, 2008, June 20, 2008 14:59, permalink
Here is my white list approach. It is to basically strip out all the HTML tags and rebuild the HTML that is put in to the database with my whitelist of tags and attributes. Let me know what you think. The key of the dictionary is the whitelisted tag. The list in the value of the dictionary are allowed attributes.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
private static readonly Regex HtmlTagExpression = new Regex(@"(?'tag_start'</?)(?'tag'\w+)((\s+(?'attr'(?'attr_name'\w+)(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+)))?)+\s*|\s*)(?'tag_end'/?>)", RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex WhiteSpaceBetweenHtmlTagsExpression = new Regex(@">(/w+)<", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex HtmlLineBreakExpression = new Regex(@"<br(/s+)/>", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Dictionary<string, List<string>> ValidHtmlTags = new Dictionary<string, List<string>> { { "p", new List<string>() }, { "br", new List<string>() }, { "strong", new List<string>() }, { "b", new List<string>() }, { "em", new List<string>() }, { "i", new List<string>() }, { "u", new List<string>() }, { "strike", new List<string>() }, { "ol", new List<string>() }, { "ul", new List<string>() }, { "li", new List<string>() }, { "a", new List<string> { "href" } }, { "img", new List<string> { "src", "height", "width", "alt" } }, { "q", new List<string> { "cite" } }, { "cite", new List<string>() }, { "abbr", new List<string>() }, { "acronym", new List<string>() }, { "del", new List<string>() }, { "ins", new List<string>() } }; /// <summary> /// Toes the safe HTML. /// </summary> /// <param name="text">The text.</param> /// <returns></returns> public static string ToSafeHtml(this string text) { return text .RemoveInvalidHtmlTags(); } /// <summary> /// Removes the invalid HTML tags. /// </summary> /// <param name="text">The text.</param> /// <returns></returns> public static string RemoveInvalidHtmlTags(this string text) { return HtmlTagExpression.Replace(text, new MatchEvaluator((Match m) => { if (!ValidHtmlTags.ContainsKey(m.Groups["tag"].Value)) return String.Empty; string generatedTag = String.Empty; System.Text.RegularExpressions.Group tagStart = m.Groups["tag_start"]; System.Text.RegularExpressions.Group tagEnd = m.Groups["tag_end"]; System.Text.RegularExpressions.Group tag = m.Groups["tag"]; System.Text.RegularExpressions.Group tagAttributes = m.Groups["attr"]; generatedTag += (tagStart.Success ? tagStart.Value : "<"); generatedTag += tag.Value; foreach (Capture attr in tagAttributes.Captures) { int indexOfEquals = attr.Value.IndexOf('='); // don't proceed any futurer if there is no equal sign or just an equal sign if (indexOfEquals < 1) continue; string attrName = attr.Value.Substring(0, indexOfEquals); // check to see if the attribute name is allowed and write attribute if it is if (ValidHtmlTags[tag.Value].Contains(attrName)) generatedTag += " " + attr.Value; } // add nofollow to all hyperlinks if (tagStart.Success && tagStart.Value == "<" && tag.Value.Equals("a", StringComparison.OrdinalIgnoreCase)) generatedTag += " rel=\"nofollow\""; generatedTag += (tagEnd.Success ? tagEnd.Value : ">"); return generatedTag; })); }
Nick
June 20, 2008, June 20, 2008 15:16, permalink
I just updated it to use StringBuilder. What I like about this method, that I am using, is that I can add special processing in the code, such as adding a rel="nofollow" when an "a" tag is found. I am using this with out much trouble on http://www.ideapipe.com. It is very fast, because for the most part my WYSIWYG editor is limiting most of the HTML anyways. Personally I like the whitelist everything even the attributes approach, because browsers allow you to add onclick, onfocus, onwhatever to every tag, and they will all execute JavaScript.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
return HtmlTagExpression.Replace(text, new MatchEvaluator((Match m) => { if (!ValidHtmlTags.ContainsKey(m.Groups["tag"].Value)) return String.Empty; StringBuilder generatedTag = new StringBuilder(m.Length); System.Text.RegularExpressions.Group tagStart = m.Groups["tag_start"]; System.Text.RegularExpressions.Group tagEnd = m.Groups["tag_end"]; System.Text.RegularExpressions.Group tag = m.Groups["tag"]; System.Text.RegularExpressions.Group tagAttributes = m.Groups["attr"]; generatedTag.Append(tagStart.Success ? tagStart.Value : "<"); generatedTag.Append(tag.Value); foreach (Capture attr in tagAttributes.Captures) { int indexOfEquals = attr.Value.IndexOf('='); // don't proceed any futurer if there is no equal sign or just an equal sign if (indexOfEquals < 1) continue; string attrName = attr.Value.Substring(0, indexOfEquals); // check to see if the attribute name is allowed and write attribute if it is if (ValidHtmlTags[tag.Value].Contains(attrName)) { generatedTag.Append(' '); generatedTag.Append(attr.Value); } } // add nofollow to all hyperlinks if (tagStart.Success && tagStart.Value == "<" && tag.Value.Equals("a", StringComparison.OrdinalIgnoreCase)) generatedTag.Append(" rel=\"nofollow\""); generatedTag.Append(tagEnd.Success ? tagEnd.Value : ">"); return generatedTag.ToString(); }));
Rob Allen
June 20, 2008, June 20, 2008 17:50, permalink
@Jeff, there was a post from Owen on the Stack Overflow blog that linked to this class: http://code.google.com/p/subsonicforums/source/browse/trunk/SubSonic.Forums.Data/HtmlScrubber.cs?r=61 which takes this challenge from the other side of the coin. Instead of stripping out what isn't allowable, it finds the allowable stuff and dumps the rest. For me, that sounds like a more effective way to approach this given the immense number of possible attacks and the relatively small number of safe attributes.
Chris Jester-Young
June 20, 2008, June 20, 2008 22:50, permalink
Reading the ha.ckers page (as linked to from the Stack Overflow blog), I must say, I'm almost ready to give up. :-P The original code (from mental evaluation; I haven't run the code) seems to allow this: <script src="javascript:alert('XSS')" (any src that contains the XSS is fine, as long as the right-angle-bracket is missing)
Also, WRT Nick's solution (which constrains usable attributes), the following exploits are possible: <a href="javascript:alert('XSS')"><img src="javascript:alert('XSS')" /></a> (I don't know if a URI-scheme filter is enough to filter out all URI-based attacks---I'll read further in that page to see....)
OJ
June 21, 2008, June 21, 2008 00:13, permalink
I agree with Rob's last comment. One of the assumptions that's been made so far is that the input contains <em>valid</em> HTML markup. There's nothing stopping people from abusing poor/inavlid markup which may just make it past your validation and end up in the browser -- and depending on the browser you're using you may see a poor attempt at rendering that malformed HTML.
PS. This site is great! I'll be using it to code share/refactor in the future. Cheers!
Chris Jester-Young
June 21, 2008, June 21, 2008 02:57, permalink
The no-closing-angle form of the tag still needs to be caught---search "no closing script tags" in the ha.ckers page, and see the next few entries below it, up to "double open angle brackets". This is especially important if people drop a <script (or <img, or whatever) tag at the end of their text input (and therefore all further tags are outside the scrutiny of the sanitiser).
For example, say your page has this sort of template:
<div class="comment">...</div>
If there is a <script src="..." (or <img src="javascript:..." for IE) at the end of the user input, you'd end up with:
<div class="comment">...<script src="..." </div>
This would trigger "half open HTML" on IE (if the ending tag is an unclosed <img tag), or "double open angle brackets" on Firefox (if the ending tag is an unclosed <script tag). Post two comments, one with each exploit, and you can win both cases. :-P
Chris Jester-Young
June 21, 2008, June 21, 2008 03:15, permalink
Another vulnerability, owing to the fact that System.String.Replace() replaces all instances, not just the first: <foo><scr<foo>ipt src="..." />. Or, perhaps: <foo><scr<foo>ipt>...</scr<foo>ipt>.
One way to solve this would be, rather than strip out bad tags, reject the whole expression entirely when a bad tag is seen, either by throwing exceptions, or (if you're from the Joel Spolsky school of error handling) returning null. :-)
Jeff Atwood
June 21, 2008, June 21, 2008 04:48, permalink
Hi Chris -- those are both *excellent* points, and I verified both. Working on it.. the second one is fairly easy (just replace the one instance) but the unclosed tag thing is rough. Suggestions?
Chris Jester-Young
June 21, 2008, June 21, 2008 05:14, permalink
My initial suggestion would be to use both > and end-of-line as matching end-of-tag, as shown in the Code box. I haven't investigated fully as to whether this actually solves the problem in all cases (perhaps when I'm less tired I can have another go...), but the sample tests I did seemed to work.
1
var tags = Regex.Matches(html, "<[^>]*?(>|$)", RegexOptions.Singleline);
Stefan Ciobaca
June 21, 2008, June 21, 2008 07:01, permalink
[quote]
Hi Chris -- those are both *excellent* points, and I verified both. Working on it.. the second one is fairly easy (just replace the one instance) but the unclosed tag thing is rough. Suggestions?
[/quote]
No, it's not *that* easy. By replacing the first match you introduce another class of (more subtle) vulnerabilities (it's your homework for this weekend to think what it is; hint: it's almost like the bug of replacing everything, but a bit more difficult). The game: "here's my fix; can you find a new bug?" goes on forever. It's ok to want to learn about XSS; the proof that you learned is that you will not allow the use of HTML in the comments.
Garry Shutler
June 21, 2008, June 21, 2008 10:09, permalink
Just a thought. In the example given by someone the comment goes in <div class="comment">...</div>
You could do with a check that makes sure they don't start their comment with </div> and break out of your styling. This would work as part of a deeper check that all tags are in pairs or end />
Nick
June 21, 2008, June 21, 2008 13:59, permalink
Jeff I think we are forgetting about something. You have all of these tags which are valid, but what if somebody copies and pastes from the web? Where <h /> tags and <p /> tags and probably everything else contain a class, style, or etc attribute. You need to strip out the attributes that you don't care about, while keeping the general tag. I provided a solution to this a couple items back.
Jeff Atwood
June 21, 2008, June 21, 2008 16:35, permalink
> but what if somebody copies and pastes from the web
Nick, I don't think that's the use case here.
1) If you are pasting a code sample it'll be inside a <pre><code> block (in the WMD editor, highlight the code snippet and use CTRL+K or press the "code" toolbar button) and thus fully escaped, so it'll appear as-is.
2) If you are composing question or answer text, you'd probably paste from the textual content of the web page, not the view source.
> By replacing the first match you introduce another class of (more subtle) vulnerabilities
You're implying that a later bad tag would somehow appear earlier in the string as a substring, but I cannot for the life of me come up with an example of that working. If a tag is contained within another tag, it'll get stripped by the new tag match that uses either > or $ (end of text) as the end match.
Can you provide an example of this that works, given the new tag match regex? I can't come up with one.
Just in case, you can change the Strip function as below (note: params slightly different, but it's an easy edit), and it will only replace the exact match at the exact location it was originally found. It's a little annoying because we have to modify the original match location with an offset, because the size of the string keeps changing as we remove tags from it that failed our whitelist check..
1 2 3 4 5 6 7 8 9
/// <summary> /// strips tag from the string /// </summary> private static string Strip(Match m, string html, ref int offset) { var loc = m.Index - offset; offset += m.Length; return html.Substring(0, loc) + html.Substring(loc + m.Length); }
Stefan Ciobaca
June 21, 2008, June 21, 2008 22:16, permalink
>You're implying that a later bad tag would somehow appear earlier in the string as
>a substring, but I cannot for the life of me come up with an example of that working.
I imply quite the opposite, really.
<foo<bar><scr<bar>ipt/>
I'm just guessing; since you write C#, I can't actually run your code. I'm supposing something like this will happen:
remove first <foo<bar> ==> <scr<bar>ipt/>
remove first <bar> ==> <script/>
remove first <scr<bar> ==> <script/>
remove first <bar> => <script/>
I'm not sure Matches works like this, because the MSDN documentation on it is at best childish.
Again, a wrong solution to this is, for example, to forbid < in the regexp that searches for tags.
Also, RegexOptions.SingleLine is a terrible name for what is does.
Jon Galloway
June 21, 2008, June 21, 2008 23:25, permalink
I've played with refactoring some code posted on the Html Agility Pack forums (http://www.codeplex.com/htmlagilitypack/Thread/View.aspx?ThreadId=24346) to use a whitelist approach instead of a blacklist. Seems to work well, and it handles tag nesting, too.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
public string ConvertHtml(string html) { HtmlDocument doc = new HtmlDocument(); doc.LoadHtml(html); //Modified - using a whitelist of allowed elements string[] allowedElements = "a|p|img|pre".Split('|'); HtmlNodeCollection nc = doc.DocumentNode.ChildNodes; //This should work: nc=doc.DocumentNode.SelectNodes("//*[not(a|img|p)]"); if (nc != null) { foreach (HtmlNode node in nc) { if(!allowedElements.Contains(node.Name)) node.ParentNode.RemoveChild(node, false); } } //remove hrefs to java/j/vbscript URLs nc = doc.DocumentNode.SelectNodes("//a[starts-with(@href, 'javascript')]|//a[starts-with(@href, 'jscript')]|//a[starts-with(@href, 'vbscript')]"); if (nc != null) { foreach (HtmlNode node in nc) { node.SetAttributeValue("href", "protected"); } } //remove img with refs to java/j/vbscript URLs nc = doc.DocumentNode.SelectNodes("//img[starts-with(@src, 'javascript')]|//img[starts-with(@src, 'jscript')]|//img[starts-with(@src, 'vbscript')]"); if (nc != null) { foreach (HtmlNode node in nc) { node.SetAttributeValue("src", "protected"); } } //remove on<Event> handlers from all tags nc = doc.DocumentNode.SelectNodes("//*[@onclick or @onmouseover or @onfocus or @onblur or @onmouseout or @ondoubleclick or @onload or @onunload]"); if (nc != null) { foreach (HtmlNode node in nc) { node.Attributes.Remove("onFocus"); node.Attributes.Remove("onBlur"); node.Attributes.Remove("onClick"); node.Attributes.Remove("onMouseOver"); node.Attributes.Remove("onMouseOut"); node.Attributes.Remove("onDoubleClick"); node.Attributes.Remove("onLoad"); node.Attributes.Remove("onUnload"); } } return doc.DocumentNode.WriteTo(); }
Jeff Atwood
June 22, 2008, June 22, 2008 02:29, permalink
> I'm just guessing; since you write C#, I can't actually run your code. I'm supposing something like this will happen:
Well, you don't need C#, you could use a regex testing tool of your choice to see what actually happens. Remember, our intial tag match is "<[^>]*?(>|$)"
"<foo<bar><scr<bar>ipt/>"
remove first <foo<bar> ==> "<scr<bar>ipt/>"
remove first <scr<bar> ==> "ipt/>"
resulting string is "ipt/>".
(and anyway, "first match" is moot; I added 2 lines of code so it actually replaces the correct location for each match now, to prevent any possibility of unwanted side-effect IndexOf() matches.)
Stefan Ciobaca
June 22, 2008, June 22, 2008 07:16, permalink
> Well, you don't need C#, you could use a regex testing tool of your choice to see what
> actually happens. Remember, our intial tag match is "<[^>]*?(>|$)"
The problem here is the Matches functions (which it seems doesn't find overlapping regexps), not the regexp parser ;-)
The code looks ok to me now, but here is an example that does not work as you'd expect:
"<a><" => "a><"
But it does not seem easily exploitable anymore.
What is the ? in the middle of the tag regexp doing there?
Very good idea to ask for community help on this. Very good use of refactormycode.
Chris Jester-Young
June 22, 2008, June 22, 2008 07:53, permalink
@Stefan:
Yes, that does work as expected (I just tested it): "<a><" does get completely stripped out. There would be two matches under this scheme: "<a>", and "<", and neither of which match the allowed whitelist. I dare say it works _because_ it doesn't find overlapping regexes; if it does find overlapping regexes, then the offset-counting mechanism in the 3-arg version of Strip() would be broken.
The ? in the regex does a minimal match (http://msdn.microsoft.com/en-us/library/3206d374.aspx), i.e., it would try to match as few [^>]'s as possible, rather than at most. However, in this case, it's redundant, due to the need to match either a > or end of string right afterwards. Thus "<[^>]*(>|$)" would behave identically to "<[^>]*?(>|$)".
I agree that it's a very good use of RmC...I look forward to seeing more submissions here, especially from the Stack Overflow/Coding Horror readership. :-)
Stefan Ciobaca
June 22, 2008, June 22, 2008 08:49, permalink
Sorry, I meant "<p><". (actually I meant "<a><", because I had the impression "<a>" was allowed)
The result of cleaning "<p><" should be "p><".
Stefan Ciobaca
June 22, 2008, June 22, 2008 09:00, permalink
Here's a good example that passes through from http://ha.ckers.org/xss.html.
1
<IMG SRC="http://www.thesiteyouareon.com/somecommand.php?somevariables=maliciouscode">
Chris Jester-Young
June 22, 2008, June 22, 2008 09:39, permalink
The result of cleaning "<p><" is "<p>". :-)
The attack that your last comment mentions (about valid HTML, with a valid link, pointing to page that causes the server to take a possibly-inappropriate action) is not something for the sanitiser to fix up. Instead, web applications need to take care in validating any requests sent to them. However, you brought up a good point, and it should be discussed.
LiveJournal was stung by something similar a few years ago, and the steps they took to fix it up were very instructional.
1. Any requests that _change state_ on a user's behalf need to check that requests are POST requests, not GET requests.
2. Even POST requests can be "spoofed", as LiveJournal found out. So what LiveJournal changed was that all their POST forms have a timestamp as well as a secure hash of the timestamp (based on a secret key) as hidden fields on the form. If the timestamp is over an hour old, say, or the hash does not verify correctly, the request is rejected.
They have other security measures too, which I no longer recall very clearly. Other LiveJournal users are welcome to shed more light on this. :-)
WRT implementing point 2, .NET has some HMAC classes (up to SHA512 now: http://msdn.microsoft.com/en-us/library/system.security.cryptography.hmacsha512.aspx) which can be very useful for this. (If I remember correctly, to mitigate hash-spoofing attacks even further, LiveJournal also regenerates their keys every few hours, and only the most recent two or so are used for validation. Again, I haven't studied their code for a while, so don't quote me on this. :-P)
Nick
June 22, 2008, June 22, 2008 12:10, permalink
Hi Jeff, point well taken from your last comment. I had forgotten that you were using WMD, I am use to working with the WYSIWYG editors on the web. While looking over your code I did actually have a bug in some of your assumptions. You assume that anchors are going to be written like <a href="..." title="..." />, but what if somebody does <a title="..." href="..." /> or <a href="title" rel="friend" />? It doesn't look like your regex can handle that. Same with the <img /> tag. Except this one has 25 different ways it can be written with just the attributes you allow, and hundreds with all the valid attributes.
I know I have said this before, it was probably missed with all the flurry of comments, but you might want to consider also whitelisting the attributes, so they can appear in any order.
Rob Allen
June 22, 2008, June 22, 2008 18:44, permalink
@Nick
We can handle the order issue by splitting the string then looping the resulting array through as switch - this is written right in the code editor below so take is as an example and assume it doesn't work as typed:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
String[] mArray = m.split(" "); String output = new String(); output = ""; foreach (String str in mArray) { String item = str.ToLower().substring(0,4); switch (item){ case ("src="): output += item; break; case ("href"): output += item; break; case ("titl"): output += item; break; case ("rel="): output += item; break; } }
Jonathan Holland
June 22, 2008, June 22, 2008 20:31, permalink
I used a fairly simple method on my own internal blog engine. It used a blacklist to remove tags that I did not want, its not as robust as your solution but it worked well for me:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
/// <summary> /// Escapes HTML entities that are vulnerable to attack /// </summary> /// <param name="commentText">Comment Text</param> /// <returns>Escaped Comment Text</returns> public static string EscapeMarkup(string commentText) { string returnText = string.Empty; // Remove HTML Comments commentText = commentText.Replace("<!--", ""); // strip out comments and doctype Regex docType = new Regex("<!DOCTYPE[.]*>"); returnText = docType.Replace(commentText, ""); commentText = Regex.Replace(commentText, "(.*)", @"$5"); // strip out most known tags except: // (a|b|br|blockquote|em|h1|h2|h3|h4|h5|h6|hr|i|li|ol|p|u|ul|strong|sub|sup) Regex badTags = new Regex("< [/]{0,1}(abbr|acronym|address|applet|area|base|basefont|bdo|big|body|button|caption|center|cite|code|col|colgroup|dd|del|dir|div|dfn|dl|dt|embed|fieldset|font|form|frame|frameset|head|html|iframe|img|input|ins|isindex|kbd|label|legend|link|map|menu|meta|noframes|noscript|object|optgroup|option|param|pre|q|s|samp|script|select|small|span|strike|style|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|var|xmp){1}[.]*>",RegexOptions.IgnoreCase); returnText = badTags.Replace(returnText, ""); // Convert line breaks into <BR/> tags returnText = returnText.Replace(Environment.NewLine,"<br/>"); return returnText; }
Chris Jester-Young
June 22, 2008, June 22, 2008 23:14, permalink
@Nick: The way Jeff has it is actually some sort of secure. To actually allow all sorts of reordering would either require a more complicated regex (e.g., <img( src="https?://[^"]+"| alt="[^"]*"| title="[^"]*")* ?/?> (I left out width and height on purpose, because I don't think people should be able to say width="5000"), which then has a risk that people might not put in a src attribute (harmless but annoying)), or use some sort of attribute-parsing system.
I don't like the latter idea at all: the code Jeff had has one very important virtue: it's simple, and thus easier to vet than most of the others I've seen here. I would be unhappy to see the code depart from that simplicity.
Remember that the regex enforces the use of double-quotes as attribute delimiter (not single quotes, not backquotes, and definitely not no quotes), with no spaces around the equal sign, and lower-case attribute names only. People who use an attribute-parsing system might be tempted to relax on any of those. Lenient HTML == vulnerable HTML.
@Rob: Your system might work but the matching must be done using a regex, not just the first 4 chars. In particular the src must still require a http:/https: scheme (and definitely not let through javascript: URIs).
Chris Jester-Young
June 22, 2008, June 22, 2008 23:30, permalink
@Jonathan: New elements come out as standards evolve, but more importantly, each browser has its own set of proprietary tags. Keeping up with the list of blacklisted elements is a Sisyphean task.
Also, I just tested your program, it doesn't seem to strip any of my test cases at all. What test cases do you use for your function? I'd be happy to send you some of mine.
Nick
June 23, 2008, June 23, 2008 00:35, permalink
@Chris: it is actually simpler than you make it out to be, you don't actually create an expression for each tag you want to check, you just create one expression that will strip out everything you need. And then you reconstruct the tag using a white list of attributes. I have posted the regex below. I don't want to spam the board with my code but you can find the original here http://refactormycode.com/codes/333-sanitize-html#refactor_11281 that will do all this for you. Very simple and easy to implement code.
Also @Chris you mentioned some very trivial things such as case and quote use, both can easily be corrected when you can pull out the attributes and values from tags. I am reconstructing the tag with out the blacklisted attributes so I can just as easily call ToLower on the attribute and wrap it with quotes.
Remember you are not trying to make the system a burden the users, if they enter in a link tag with some attributes that don't fall in white list you shouldn't just ignore the whole link, because 99 times out of 100 they didn't really care about the extra attribute but they most definitely cared about the link. Same can be argued for <blockquote /> where somebody might use the cite attribute.
Remember the purpose of this is to sanitize the HTML, not provide an iron fist. An Iron fist will just confuse the user as to why their link was not entered in to the system.
1
@"(?'tag_start'</?)(?'tag'\w+)((\s+(?'attr'(?'attr_name'\w+)(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+)))?)+\s*|\s*)(?'tag_end'/?>)"
Rob Allen
June 23, 2008, June 23, 2008 12:40, permalink
@Chris - thanks for the feedback. My hacker hat was in the wash when I wrote that and I completely forgot that IMG Src attributes can come in many forms - not the least of which are scripts with mime headers which I have used in the past to serve images from a database (i.e., <img src="image.php?pid=12"/>)
1
<img src="image.php?pid=12"/>
mmp1
June 23, 2008, June 23, 2008 12:46, permalink
All the regex stuff looks really small and compact code wise, but you are basically trying to write a simple HTML parser in regex. Also, the code might look small, but it might not be the fastest code. ideas to consider->since you are .NET (I assume you are using an MS .net and not say MONO on linux) why not use say lightweight MSHTML, get a DOM model and then iter. over the DOM. (or another HTML parser with a DOM type model). Then pull out the tags and attribs you want using the whitelist. Once you have it in the dom its easy to do other things (and run transforms etc). Also, only have to parse the file once (not mulitple times). It might seem a little bit of work, but it might be the most bullet proof.
(see here for lightweight mshtml example. http://www.codeguru.com/cpp/i-n/ieprogram/article.php/c4385/#more ). Note sure how this would handl high loads (ie. any memory leaks etc).
Or maybe this is more of a c-programmers solution ;-) (write a parser, use a data structure....etc).
Rob Allen
June 23, 2008, June 23, 2008 14:42, permalink
@mmp
Since this script is for guarding against XSS attacks in forum/blog/wiki posts, accessing the DOM is too late in the process. This script would be run before the content of the request is posted to the DB and the dom doesn't have the new data for it yet.
Mike H
June 23, 2008, June 23, 2008 17:05, permalink
Why not iterate your match collection in reverse to avoid the index/offset code? You can then just use String.Remove() and axe the Strip() method, whose purpose is just to maintain index shortener.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
/// <summary> /// sanitize any potentially dangerous tags from the provided raw HTML input using /// a whitelist based approach, leaving the "safe" HTML tags /// </summary> public static string SanitizeHtml(string html) { // Match all html tags var tags = Regex.Matches(html, "<[^>]*(>|$)", RegexOptions.Singleline); var whitelist = @"</?p>|<br\s?/?>|</?b>|</?strong>|</?i>|</?em>|</?s>|</?strike>| </?blockquote>|</?sub>|</?super>|</?h(1|2|3)>|</?pre>|<hr\s?/?>| </?code>|</?ul>|</?ol>|</?li>| </a>|<a[^>]+>|<img[^>]+/?>"; // loop through each matched tag for (Int32 i=tags.Count-1; i>=0; i--) { Match tag = tags[i]; if (!IsMatch(tag.Value, whitelist)) { // not on our whitelist? I SAY GOOD DAY TO YOU, SIR. GOOD DAY! html = html.Remove(tag.Index, tag.Length); } else if (IsMatch(tag.Value, "^<a")) { // detailed <a> tag checking if (!IsMatch(tag.Value, @"<a\s href=""(https?|ftp)://[^""]+"" (\stitle=""[^""]+"")?\s?>")) { html = html.Remove(tag.Index, tag.Length); } } else if (IsMatch(tag.Value, "^<img")) { // detailed <img> tag checking if (!IsMatch(tag.Value, @"<img\s src=""https?://[^""]+"" (\swidth=""[^""]*"")? (\sheight=""[^""]*"")? (\salt=""[^""]*"")? (\stitle=""[^""]*"")? \s?/>")) { html = html.Remove(tag.Index, tag.Length); } } } return html; }
Stefan Ciobaca
June 23, 2008, June 23, 2008 17:38, permalink
> Why not iterate your match collection in reverse to avoid the index/offset code?
Yes, a lot cleaner.
> The result of cleaning "<p><" is "<p>". :-)
Not in the version where you remove the first occurence of the bad tag.
> The attack that your
Takes a provided HTML string and removes any potentially dangerous XSS HTML tags using a whitelist approach. Useful when you want to allow a small subset of "safe" HTML tags in user content.
UPDATED: July 11th to reflect all refactorings, plus optimizing for speed. Now 2x faster!
UPDATED: Aug 26th, bugfixes