You are here: Home > Latest news from Darcs > Imports changes from Mediacloth revisions r132-137

Revision 20080707203723-9043f-f95ae3...

Imports changes from Mediacloth revisions r132-137

Partially fixes problems with script tags

vendor/mediacloth-trunk/lib/mediacloth/mediawikilexer.rb
vendor/mediacloth-trunk/test/htmlgenerator.rb
vendor/mediacloth-trunk/test/lexer.rb
vendor/mediacloth-trunk/test/sanitization.rb

Changes to mediawikilexer.rb

149
    @lexer_table.push(@default_lexer_table)
    @lexer_table.push(@default_lexer_table)
149
150
  end
  end
150
151
151
 
  WHITELIST = %w{del  ins  b    i    em   u    s    strike    font
152
 
                 big  small     sub  sup  cite code tt   var  strong
153
 
                 span h1   h2   h3   h4   h5   h6   div  center
154
 
                 blockquote     ol   li   ul   table     tr   th   td
155
 
                 ruby rb   rp   rt   p    br   hr   dl   dt   dd
156
 
                 pre  nowiki    math}
157
 
158
 
  # Sanitizes the raw wiki input for dangerous HTML tags
159
 
  def sanitize(input)
160
 
    input.gsub(/<(\/?)([^\s>\/]+)([^>]*)>/) do
161
 
      WHITELIST.include?($2) ? $& : "&lt;#{$1}#{$2}#{$3}&gt;"
162
 
    end
163
 
  end
164
152
  
  
165
153
  def tokenize(input)
  def tokenize(input)
166
154
    @text = input
    @text = sanitize(input)
167
155
    # Current position in the input text
    # Current position in the input text
168
156
    @cursor = 0
    @cursor = 0
169
157
    # Tokens to be returned
    # Tokens to be returned
170

Changes to htmlgenerator.rb

6
require 'test/unit'
require 'test/unit'
6
7
require 'testhelper'
require 'testhelper'
7
8
8
 
require 'hpricot'
9
 
10
9
class HTMLGenerator_Test < Test::Unit::TestCase
class HTMLGenerator_Test < Test::Unit::TestCase
11
10
12
11
    include TestHelper
    include TestHelper
13
57 more lines
69
      generator = MediaWikiHTMLGenerator.new
      generator = MediaWikiHTMLGenerator.new
71
70
      generator.link_handler = link_handler if link_handler
      generator.link_handler = link_handler if link_handler
72
71
      generator.parse(ast)
      generator.parse(ast)
73
72
      assert_equal(result, generator.html, message)
      assert_same_html(result, generator.html, message)
74
73
   end
  end
75
 
76
 
  def assert_same_html(expected, result, message)
77
 
    assert_equal(Hpricot(expected).to_s, Hpricot(result).to_s, message)
78
 
  end
79
74
end
end
80
75
81
76
class LinkAttributeHandler < MediaWikiLinkHandler
class LinkAttributeHandler < MediaWikiLinkHandler
82

Changes to lexer.rb

388
      lex("<tt/>"))
      lex("<tt/>"))
388
389
    assert_equal([[:PARA_START, ""], [:TAG_START, "tt"], [:TAG_END, "tt"], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:TAG_START, "tt"], [:TAG_END, "tt"], [:PARA_END, ""], [false, false]],
389
390
      lex("<tt />"))
      lex("<tt />"))
390
391
    assert_equal([[:PARA_START, ""], [:TEXT, "<123>"], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:CHAR_ENT, "lt"], [:TEXT, "123"], [:CHAR_ENT, "gt"], [:PARA_END, ""], [false, false]],
391
392
      lex("<123>"))
      lex("<123>"))
392
393
    assert_equal([[:PARA_START, ""], [:TEXT, "<xx xx>"], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:CHAR_ENT, "lt"], [:TEXT, "xx xx"], [:CHAR_ENT, "gt"], [:PARA_END, ""], [false, false]],
393
394
      lex("<xx xx>"))
      lex("<xx xx>"))
394
395
    assert_equal([[:PARA_START, ""], [:TEXT, "</xxx "], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:TEXT, "</xxx "], [:PARA_END, ""], [false, false]],
395
396
      lex("</xxx "))
      lex("</xxx "))
396
397
    assert_equal([[:PARA_START, ""], [:TEXT, "<xx </xx>"], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:CHAR_ENT, "lt"], [:TEXT, "xx </xx"], [:CHAR_ENT, "gt"], [:PARA_END, ""], [false, false]],
397
398
      lex("<xx </xx>"))
      lex("<xx </xx>"))
398
399
    assert_equal([[:PARA_START, ""], [:TEXT, "<xx a='b' c>"], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:CHAR_ENT, "lt"], [:TEXT, "xx a='b' c"], [:CHAR_ENT, "gt"], [:PARA_END, ""], [false, false]],
399
400
      lex("<xx a='b' c>"))
      lex("<xx a='b' c>"))
400
401
    assert_equal([[:PARA_START, ""], [:TEXT, "<>"], [:PARA_END, ""], [false, false]],
    assert_equal([[:PARA_START, ""], [:TEXT, "<>"], [:PARA_END, ""], [false, false]],
401
402
      lex("<>"))
      lex("<>"))
402

Changes to sanitization.rb

 
require 'mediacloth/mediawikilexer'
1
 
2
 
require 'test/unit'
3
 
require 'testhelper'
4
 
5
 
class SanitizationTest < Test::Unit::TestCase
6
 
7
 
  def setup
8
 
    @@lexer ||= MediaWikiLexer.new
9
 
  end
10
 
11
 
  def test_sanitizes_script_tags
12
 
    assert_sanitizes_to "&lt;script&gt;alert('Unescaped code!')&lt;\/script&gt;",
13
 
                        "<script>alert('Unescaped code!')</script>"
14
 
  end
15
 
16
 
  def test_keeps_deleted_and_inserted_tags
17
 
    assert_no_sanitization "This is <del>bold</del>, and that is <ins>inserted</ins>"
18
 
  end
19
 
20
 
  def test_keeps_bold_and_italics_tags
21
 
    assert_no_sanitization "This is <b>bold</b>, this is in <i>italics</i> and that is <em>emphasized</em>"
22
 
  end
23
 
24
 
  def test_keeps_underline_and_strikethrough_tags
25
 
    assert_no_sanitization "This is very <u>important</u>, but that
26
 
                            can <s>safely</s> be <strike>ignored</strike>"
27
 
  end
28
 
29
 
  def test_keeps_font_tags
30
 
    assert_no_sanitization %{Fonts can be <font face="serif">changed</font> using HTML tags}
31
 
  end
32
 
33
 
  def test_keeps_big_and_small_tags
34
 
    assert_no_sanitization "Text can be made <big>big</big>, <small>small</small>"
35
 
  end
36
 
37
 
  def test_keeps_sub_and_superscripts
38
 
    assert_no_sanitization "We can also use <sub>sub</sub> and <sup>superscripts</sup>"
39
 
  end
40
 
41
 
  def test_keeps_citation_tags
42
 
    assert_no_sanitization %{<cite>"Perfection is achieved, not when there is
43
 
                                    nothing left to add, but when there is
44
 
                                    nothing left to remove."</cite>
45
 
                             -- Antoine de Saint-Exupery}
46
 
  end
47
 
48
 
  def test_keeps_code_and_teletype
49
 
    assert_no_sanitization "Text inside <code>code</code> and <tt>teletype</tt>
50
 
                            usually get rendered with a fixed width font"
51
 
  end
52
 
53
 
  def test_keeps_variable_tags
54
 
    assert_no_sanitization "Here is a <var>variable</var>"
55
 
  end
56
 
57
 
  def test_keeps_strong_tags
58
 
    assert_no_sanitization "That was a very <strong>strong</strong> claim"
59
 
  end
60
 
61
 
  def test_keeps_spans
62
 
    assert_no_sanitization %{Most environments will render
63
 
                             <span style="color: red">this text</span> with
64
 
                             different colours}
65
 
  end
66
 
67
 
  def test_keeps_headings
68
 
    assert_no_sanitization "<h1>Heading 1</h1>
69
 
                            <h2>Heading 2</h2>
70
 
                            <h3>Heading 3</h3>
71
 
                            <h4>Heading 4</h4>
72
 
                            <h5>Heading 5</h5>
73
 
                            <h6>Heading 6</h6>"
74
 
  end
75
 
76
 
  def test_keeps_divs
77
 
    assert_no_sanitization "<div>Division</div>"
78
 
  end
79
 
80
 
  def test_keeps_center_tags
81
 
    assert_no_sanitization "<center>Center</center>"
82
 
  end
83
 
84
 
  def test_keeps_blockquote_tags
85
 
    assert_no_sanitization "<blockquote>Blockquote</blockquote>"
86
 
  end
87
 
88
 
  def test_keeps_ordered_and_unordered_lists
89
 
    assert_no_sanitization "<ol>
90
 
                              <li>Ordered</li>
91
 
                              <li>List</li>
92
 
                              <li>(And list items)</li>
93
 
                            </ol>
94
 
95
 
                            <ul>
96
 
                              <li>Unordered</li>
97
 
                              <li>List</li>
98
 
                              <li>(And list items)</li>
99
 
                            </ul>"
100
 
  end
101
 
102
 
  def test_keeps_table_and_main_components
103
 
    assert_no_sanitization "<table>
104
 
                              <tr><th>Table</th>    <th>tag</th>   <th /></tr>
105
 
                              <tr><td>and</td>      <td>its</td>   <td>components</td></tr>
106
 
                              <tr><td>including</td><td>header</td><td>tags</td></tr>
107
 
                            </table>"
108
 
  end
109
 
110
 
  def test_keeps_ruby_tag_and_components
111
 
    assert_no_sanitization "<ruby>
112
 
                              <rb>Ruby base</rb>
113
 
                              <rp>(</rp>
114
 
                              <rt>Ruby text</rt>
115
 
                              <rp>)</rp>
116
 
                            </ruby>"
117
 
  end
118
 
119
 
  def test_keeps_paragraph_tags
120
 
    assert_no_sanitization "We can also break <p>paragraphs</p> with HTML."
121
 
  end
122
 
123
 
  def test_keeps_linebreaks
124
 
    assert_no_sanitization "Break lines with an empty element<br /><br/>
125
 
                            Or using the opening tag only <br>"
126
 
  end
127
 
128
 
  def test_keeps_horizontal_rules
129
 
    assert_no_sanitization "<hr />Display an horizontal rule"
130
 
  end
131
 
132
 
  def test_keeps_definition_lists
133
 
    assert_no_sanitization "<dl>
134
 
                              <dt>Definition terms</dt>
135
 
                              <dd>And descriptions</dt>
136
 
                            </dl>"
137
 
  end
138
 
139
 
  def test_keeps_preformatted_text
140
 
    assert_no_sanitization "<pre>Preformatted\ntext</pre>"
141
 
  end
142
 
143
 
  def test_keeps_nowiki_tags
144
 
    assert_no_sanitization "<nowiki>No wiki tag</nowiki>"
145
 
  end
146
 
147
 
  def test_keeps_math_tags
148
 
    assert_no_sanitization "<math>1 == 1</math>"
149
 
  end
150
 
151
 
  def test_sanitizes_thead_and_tbody_tags
152
 
    assert_sanitizes_to "&lt;thead&gt;Table header&lt;/thead&gt;&lt;tbody&gt;Table body&lt;/tbody&gt;",
153
 
                        "<thead>Table header</thead><tbody>Table body</tbody>"
154
 
  end
155
 
156
 
  def test_sanitizes_form_label_and_input_tags
157
 
    assert_sanitizes_to %{&lt;form action="/send" method="post"&gt;
158
 
                            &lt;label for="username"&gt;Username&lt;/label&gt;
159
 
                            &lt;input name="login" id="username" /&gt;
160
 
                          &lt;/form&gt;},
161
 
                        %{<form action="/send" method="post">
162
 
                            <label for="username">Username</label>
163
 
                            <input name="login" id="username" />
164
 
                          </form>}
165
 
  end
166
 
167
 
  # TODO sanitize tags with closing spaces
168
 
  # TODO remove "on" attributes even in legal tags
169
 
170
 
private
171
 
172
 
  def assert_sanitizes_to(expected, actual)
173
 
    assert_equal expected, @@lexer.sanitize(actual)
174
 
  end
175
 
176
 
  def assert_no_sanitization(expected)
177
 
    assert_sanitizes_to(expected, expected)
178
 
  end
179
 
180
 
end
181