You are here: Home > Latest news from Darcs > Inlines html5 gem code into Motiro

Revision 20080725225143-9043f-acb13b...

Inlines html5 gem code into Motiro

Needed for new Mediacloth sanitizer. Included to avoid requiring installation of html5 gem

vendor/html5/html5-0.10.0/History.txt
vendor/html5/html5-0.10.0/Manifest.txt
vendor/html5/html5-0.10.0/README
vendor/html5/html5-0.10.0/Rakefile.rb
vendor/html5/html5-0.10.0/bin/html5
vendor/html5/html5-0.10.0/lib/core_ext/string.rb
vendor/html5/html5-0.10.0/lib/html5/constants.rb
vendor/html5/html5-0.10.0/lib/html5/filters/base.rb
vendor/html5/html5-0.10.0/lib/html5/filters/inject_meta_charset.rb
vendor/html5/html5-0.10.0/lib/html5/filters/iso639codes.rb
vendor/html5/html5-0.10.0/lib/html5/filters/optionaltags.rb
vendor/html5/html5-0.10.0/lib/html5/filters/rfc2046.rb
vendor/html5/html5-0.10.0/lib/html5/filters/rfc3987.rb
vendor/html5/html5-0.10.0/lib/html5/filters/sanitizer.rb
vendor/html5/html5-0.10.0/lib/html5/filters/validator.rb
vendor/html5/html5-0.10.0/lib/html5/filters/whitespace.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/after_body_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/after_frameset_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/after_head_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/before_head_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_body_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_caption_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_cell_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_column_group_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_frameset_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_head_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_row_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_select_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_table_body_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/in_table_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/initial_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/root_element_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser/trailing_end_phase.rb
vendor/html5/html5-0.10.0/lib/html5/html5parser.rb
vendor/html5/html5-0.10.0/lib/html5/inputstream.rb
vendor/html5/html5-0.10.0/lib/html5/liberalxmlparser.rb
vendor/html5/html5-0.10.0/lib/html5/sanitizer.rb
vendor/html5/html5-0.10.0/lib/html5/serializer/htmlserializer.rb
vendor/html5/html5-0.10.0/lib/html5/serializer/xhtmlserializer.rb
vendor/html5/html5-0.10.0/lib/html5/serializer.rb
vendor/html5/html5-0.10.0/lib/html5/sniffer.rb
vendor/html5/html5-0.10.0/lib/html5/tokenizer.rb
vendor/html5/html5-0.10.0/lib/html5/treebuilders/base.rb
vendor/html5/html5-0.10.0/lib/html5/treebuilders/hpricot.rb
vendor/html5/html5-0.10.0/lib/html5/treebuilders/rexml.rb
vendor/html5/html5-0.10.0/lib/html5/treebuilders/simpletree.rb
vendor/html5/html5-0.10.0/lib/html5/treebuilders.rb
vendor/html5/html5-0.10.0/lib/html5/treewalkers/base.rb
vendor/html5/html5-0.10.0/lib/html5/treewalkers/hpricot.rb
vendor/html5/html5-0.10.0/lib/html5/treewalkers/rexml.rb
vendor/html5/html5-0.10.0/lib/html5/treewalkers/simpletree.rb
vendor/html5/html5-0.10.0/lib/html5/treewalkers.rb
vendor/html5/html5-0.10.0/lib/html5/version.rb
vendor/html5/html5-0.10.0/lib/html5.rb
vendor/html5/sanitizer.rb

Changes to History.txt

 
== 0.10.0 2007-10-08
1
 
* proof-of-concept validator
2
 
* easier to localize error reporting
3
 
* many unit tests
4
 
5
 
== 0.1.0 / 2007-08-07
6
 
7
 
* 1 major enhancement
8
 
  * Birthday!
9
 
10

Changes to Manifest.txt

 
History.txt
1
 
Manifest.txt
2
 
README
3
 
Rakefile.rb
4
 
bin/html5
5
 
lib/core_ext/string.rb
6
 
lib/html5.rb
7
 
lib/html5/constants.rb
8
 
lib/html5/filters/base.rb
9
 
lib/html5/filters/inject_meta_charset.rb
10
 
lib/html5/filters/iso639codes.rb
11
 
lib/html5/filters/optionaltags.rb
12
 
lib/html5/filters/rfc2046.rb
13
 
lib/html5/filters/rfc3987.rb
14
 
lib/html5/filters/sanitizer.rb
15
 
lib/html5/filters/validator.rb
16
 
lib/html5/filters/whitespace.rb
17
 
lib/html5/html5parser.rb
18
 
lib/html5/html5parser/after_body_phase.rb
19
 
lib/html5/html5parser/after_frameset_phase.rb
20
 
lib/html5/html5parser/after_head_phase.rb
21
 
lib/html5/html5parser/before_head_phase.rb
22
 
lib/html5/html5parser/in_body_phase.rb
23
 
lib/html5/html5parser/in_caption_phase.rb
24
 
lib/html5/html5parser/in_cell_phase.rb
25
 
lib/html5/html5parser/in_column_group_phase.rb
26
 
lib/html5/html5parser/in_frameset_phase.rb
27
 
lib/html5/html5parser/in_head_phase.rb
28
 
lib/html5/html5parser/in_row_phase.rb
29
 
lib/html5/html5parser/in_select_phase.rb
30
 
lib/html5/html5parser/in_table_body_phase.rb
31
 
lib/html5/html5parser/in_table_phase.rb
32
 
lib/html5/html5parser/initial_phase.rb
33
 
lib/html5/html5parser/phase.rb
34
 
lib/html5/html5parser/root_element_phase.rb
35
 
lib/html5/html5parser/trailing_end_phase.rb
36
 
lib/html5/inputstream.rb
37
 
lib/html5/liberalxmlparser.rb
38
 
lib/html5/sanitizer.rb
39
 
lib/html5/serializer.rb
40
 
lib/html5/serializer/htmlserializer.rb
41
 
lib/html5/serializer/xhtmlserializer.rb
42
 
lib/html5/sniffer.rb
43
 
lib/html5/tokenizer.rb
44
 
lib/html5/treebuilders.rb
45
 
lib/html5/treebuilders/base.rb
46
 
lib/html5/treebuilders/hpricot.rb
47
 
lib/html5/treebuilders/rexml.rb
48
 
lib/html5/treebuilders/simpletree.rb
49
 
lib/html5/treewalkers.rb
50
 
lib/html5/treewalkers/base.rb
51
 
lib/html5/treewalkers/hpricot.rb
52
 
lib/html5/treewalkers/rexml.rb
53
 
lib/html5/treewalkers/simpletree.rb
54
 
lib/html5/version.rb
55
 
testdata/encoding/chardet/test_big5.txt
56
 
testdata/encoding/test-yahoo-jp.dat
57
 
testdata/encoding/tests1.dat
58
 
testdata/encoding/tests2.dat
59
 
testdata/sanitizer/tests1.dat
60
 
testdata/serializer/core.test
61
 
testdata/serializer/injectmeta.test
62
 
testdata/serializer/optionaltags.test
63
 
testdata/serializer/options.test
64
 
testdata/serializer/whitespace.test
65
 
testdata/sites/google-results.htm
66
 
testdata/sites/python-ref-import.htm
67
 
testdata/sites/web-apps-old.htm
68
 
testdata/sites/web-apps.htm
69
 
testdata/sniffer/htmlOrFeed.json
70
 
testdata/tokenizer/contentModelFlags.test
71
 
testdata/tokenizer/entities.test
72
 
testdata/tokenizer/escapeFlag.test
73
 
testdata/tokenizer/test1.test
74
 
testdata/tokenizer/test2.test
75
 
testdata/tokenizer/test3.test
76
 
testdata/tokenizer/test4.test
77
 
testdata/tree-construction/tests1.dat
78
 
testdata/tree-construction/tests2.dat
79
 
testdata/tree-construction/tests3.dat
80
 
testdata/tree-construction/tests4.dat
81
 
testdata/tree-construction/tests5.dat
82
 
testdata/tree-construction/tests6.dat
83
 
testdata/validator/attributes.test
84
 
testdata/validator/base-href-attribute.test
85
 
testdata/validator/base-target-attribute.test
86
 
testdata/validator/blockquote-cite-attribute.test
87
 
testdata/validator/classattribute.test
88
 
testdata/validator/contenteditableattribute.test
89
 
testdata/validator/contextmenuattribute.test
90
 
testdata/validator/dirattribute.test
91
 
testdata/validator/draggableattribute.test
92
 
testdata/validator/html-xmlns-attribute.test
93
 
testdata/validator/idattribute.test
94
 
testdata/validator/inputattributes.test
95
 
testdata/validator/irrelevantattribute.test
96
 
testdata/validator/langattribute.test
97
 
testdata/validator/li-value-attribute.test
98
 
testdata/validator/link-href-attribute.test
99
 
testdata/validator/link-hreflang-attribute.test
100
 
testdata/validator/link-rel-attribute.test
101
 
testdata/validator/ol-start-attribute.test
102
 
testdata/validator/starttags.test
103
 
testdata/validator/style-scoped-attribute.test
104
 
testdata/validator/tabindexattribute.test
105
 
tests/preamble.rb
106
 
tests/test_encoding.rb
107
 
tests/test_lxp.rb
108
 
tests/test_parser.rb
109
 
tests/test_sanitizer.rb
110
 
tests/test_serializer.rb
111
 
tests/test_sniffer.rb
112
 
tests/test_stream.rb
113
 
tests/test_tokenizer.rb
114
 
tests/test_treewalkers.rb
115
 
tests/test_validator.rb
116
 
tests/tokenizer_test_parser.rb
117

Changes to README

 
html5
1
 
    by Ryan King, et al
2
 
    http://code.google.com/p/html5lib
3
 
4
 
== DESCRIPTION:
5
 
6
 
A ruby implementation of the parsing algorithm in HTML5.
7
 
8
 
9
 
== FEATURES/PROBLEMS:
10
 
11
 
12
 
13
 
== SYNOPSIS:
14
 
15
 
  TODO
16
 
17
 
== REQUIREMENTS:
18
 
19
 
* chardet, only tested with 0.9.0
20
 
21
 
== INSTALL:
22
 
23
 
* sudo gem install html5
24
 
25
 
== LICENSE:
26
 
27
 
Copyright (c) 2006-2007 The Authors
28
 
29
 
Contributors:
30
 
James Graham - jg307@cam.ac.uk
31
 
Anne van Kesteren - annevankesteren@gmail.com
32
 
Lachlan Hunt - lachlan.hunt@lachy.id.au
33
 
Matt McDonald - kanashii@kanashii.ca
34
 
Sam Ruby - rubys@intertwingly.net
35
 
Ian Hickson (Google) - ian@hixie.ch
36
 
Thomas Broyer - t.broyer@ltgt.net
37
 
Jacques Distler - distler@golem.ph.utexas.edu
38
 
Ryan King - ryan@theryanking.com
39
 
40
 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
41
 
42
 
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
43
 
44
 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
45

Changes to Rakefile.rb

 
require 'rake'
1
 
require 'hoe'
2
 
require 'lib/html5/version'
3
 
4
 
# Gem specification for html5, declared through Hoe.
Hoe.new("html5", HTML5::VERSION) do |spec|
  # Long-form texts are lifted from the project's own documentation files.
  readme_excerpt  = spec.paragraphs_of('README', 2..5)
  history_excerpt = spec.paragraphs_of('History.txt', 0..1)

  spec.name        = "html5"
  spec.summary     = "HTML5 parser/tokenizer."
  spec.description = readme_excerpt.join("\n\n")
  spec.changes     = history_excerpt.join("\n\n")

  spec.author = ['Ryan King'] # TODO: add more names
  spec.email  = 'ryan@theryanking.com'
  spec.url    = 'http://code.google.com/p/html5lib'

  # Also build a .zip package.
  spec.need_zip = true

  # Runtime dependency of the gem.
  spec.extra_deps << ['chardet', '>= 0.9.0']
end
17
 
18
 
require 'rcov/rcovtask'
19
 
20
 
# Coverage tasks. The :coverage task in the :test namespace depends on
# test:coverage:clean, so stale aggregate data is removed before each run.
namespace :test do
  namespace :coverage do
    desc "Delete aggregate coverage data."
    task :clean do
      rm_f "coverage.data"
    end
  end

  desc 'Aggregate code coverage for unit, functional and integration tests'
  Rcov::RcovTask.new(:coverage => "test:coverage:clean") do |rcov|
    rcov.libs << "tests"
    rcov.verbose    = true
    rcov.output_dir = "tests/coverage/"
    rcov.test_files = FileList["tests/test_*.rb"]
  end
end
33

Changes to html5

 
#!/usr/bin/env ruby
1
 
2
 
require 'core_ext/string'
3
 
$:.unshift File.dirname(__FILE__), 'lib'
4
 
5
 
# Parses the input named by the last command-line argument and prints the
# result as directed by +opts+.
#
# opts - option object; this method reads +treebuilder+, +output+,
#        +parsemethod+, +container+, +profile+ and +time+ from it and passes
#        it on to print_output.
# args - remaining command-line arguments; only the last one is used. It may
#        be an http:// URL, "-" for standard input, or the path of a local
#        file.
#
# Writes a message to $stderr and exits with status 1 when +args+ is empty.
# When profiling is requested the parse result is not printed; only the
# profile is written to $stderr.
def parse(opts, args)
  source = args[-1]
  unless source
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  encoding = nil
  begin
    if source[0..6] == 'http://'
      require 'open-uri'
      source = URI.parse(source).open
      encoding = source.charset
    elsif source == '-'
      source = $stdin
    else
      # File.open, not Kernel#open: the latter treats an argument starting
      # with "|" as a command to spawn, which must never happen for a value
      # taken straight from the command line.
      source = File.open(source)
    end
  rescue
    # Deliberate best effort: if the source cannot be opened, the argument
    # string itself is handed to the parser below (presumably so that literal
    # markup can be passed on the command line -- confirm against callers).
  end

  require 'html5/treebuilders'
  treebuilder = HTML5::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5/liberalxmlparser'
    parser = HTML5::XMLParser.new(:tree => treebuilder)
  else
    require 'html5/html5parser'
    parser = HTML5::HTMLParser.new(:tree => treebuilder)
  end

  # Fragment parsing takes the container element name as an extra argument.
  parse_args =
    if opts.parsemethod == :parse
      [source, encoding]
    else
      [source, (opts.container || 'div'), encoding]
    end

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *parse_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *parse_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)" % [parsed - started, printed - parsed]
  else
    document = parser.send(opts.parsemethod, *parse_args)
    print_output(parser, document, opts)
  end
end
63
 
64
 
# Prints a parsed document, and optionally the parse errors, to standard
# output.
#
# parser   - the parser that produced +document+; consulted for the detected
#            character encoding, the tree builder and the error list.
# document - the parse result (a single tree or a collection of fragments).
# opts     - option object; this method reads +encoding+, +output+,
#            +treebuilder+, +serializer+ and +error+ from it.
#
# Nothing is printed for the document when +opts.output+ is none of :xml,
# :html, :hilite or :tree.
def print_output(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    # A lone tree is wrapped so both cases can be walked the same way.
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  # Each error is a (position, error code, format data) triple; codes without
  # an entry in HTML5::E are reported verbatim.
  messages = parser.errors.map do |pos, errorcode, datavars|
    template = HTML5::E[errorcode] || "Unknown error \"#{errorcode}\""
    "Line #{pos[0]} Col #{pos[1]} " + template % datavars
  end
  $stdout.write("\nParse errors:\n" + messages.join("\n") + "\n")
end
90
 
91
 
require 'ostruct'
require 'optparse'

# Defaults for the run; the command-line switches below override them.
options = OpenStruct.new(
  :profile     => false,
  :time        => false,
  :output      => :html,
  :treebuilder => 'simpletree',
  :error       => false,
  :encoding    => false,
  :parsemethod => :parse,
  :serializer  => {
    :encoding => 'utf-8',
    :omit_optional_tags => false,
    :inject_meta_charset => false
  }
)

opts = OptionParser.new do |cli|
  # Registers a switch whose value is stored under +key+ in the serializer
  # options hash.
  serializer_switch = lambda do |switch, help, key|
    cli.on(switch, help) { |value| options.serializer[key] = value }
  end

  cli.separator ""
  cli.separator "Parse Options:"

  cli.on("-b", "--treebuilder NAME") { |name| options.treebuilder = name }

  cli.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
    options.parsemethod = :parse_fragment
    options.container = container if container
  end

  cli.separator ""
  cli.separator "Filter Options:"

  serializer_switch.call("--[no-]inject-meta-charset", "inject <meta charset>", :inject_meta_charset)
  serializer_switch.call("--[no-]strip-whitespace", "strip unnecessary whitespace", :strip_whitespace)
  serializer_switch.call("--[no-]sanitize", "escape unsafe tags", :sanitize)

  cli.separator ""
  cli.separator "Output Options:"

  cli.on("--tree", "output as debug tree") { options.output = :tree }

  cli.on("-x", "--xml", "output as xml") do
    options.output = :xml
    options.treebuilder = "rexml"
  end

  # --no-html leaves no output format selected at all.
  cli.on("--[no-]html", "Output as html") do |enabled|
    options.output = (enabled ? :html : nil)
  end

  cli.on("--hilite", "Output as formatted highlighted code.") { options.output = :hilite }

  cli.on("-e", "--error", "Print a list of parse errors") { |flag| options.error = flag }

  cli.separator ""
  cli.separator "Serialization Options:"

  serializer_switch.call("--[no-]omit-optional-tags", "Omit optional tags", :omit_optional_tags)
  serializer_switch.call("--[no-]quote-attr-values", "Quote attribute values", :quote_attr_values)
  serializer_switch.call("--[no-]use-best-quote-char", "Use best quote character", :use_best_quote_char)
  serializer_switch.call("--quote-char C", "Use specified quote character", :quote_char)
  serializer_switch.call("--[no-]minimize-boolean-attributes", "Minimize boolean attributes", :minimize_boolean_attributes)
  serializer_switch.call("--[no-]use-trailing-solidus", "Use trailing solidus", :use_trailing_solidus)
  serializer_switch.call("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values", :escape_lt_in_attrs)
  serializer_switch.call("--[no-]escape-rcdata", "Escape rcdata element values", :escape_rcdata)

  cli.separator ""
  cli.separator "Other Options:"

  cli.on("-p", "--[no-]profile", "Profile the run") { |flag| options.profile = flag }

  cli.on("-t", "--[no-]time", "Time the run") { |flag| options.time = flag }

  cli.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| options.encoding = flag }

  cli.on_tail("-h", "--help", "Show this message") do
    puts cli
    exit
  end
end

# Recognised switches are removed from ARGV; what remains names the input.
opts.parse!(ARGV)
parse(options, ARGV)
217

Changes to string.rb

 
# Extends String#% with named substitution from a Hash.
class String
  # Keep the built-in formatter reachable. Guarded so that loading this file
  # a second time does not alias the replacement to itself, which would make
  # the non-Hash branch below recurse forever.
  alias_method :old_format, :% unless method_defined?(:old_format)

  # Formats the receiver with +data+.
  #
  # data - a Hash selects named substitution: every literal "%(key)" in the
  #        receiver is replaced by the matching value (converted with to_s).
  #        Any other object is passed to the built-in String#%.
  #
  # Returns a new String; the receiver is never modified, so frozen strings
  # can be used as templates.
  define_method("%") do |data|
    if data.kind_of?(Hash)
      result = dup
      data.each do |key, value|
        # A String pattern matches literally, so keys containing regexp
        # metacharacters are safe; the block form inserts the value verbatim
        # instead of interpreting backslash sequences in it.
        result = result.gsub("%(#{key})") { value.to_s }
      end
      result
    else
      # Warnings are switched off around the built-in formatter (presumably to
      # silence format-string warnings) and the caller's setting is restored
      # afterwards, even if formatting raises.
      verbose = $VERBOSE
      begin
        $VERBOSE = false
        old_format(data)
      ensure
        $VERBOSE = verbose
      end
    end
  end
end
17

Changes to constants.rb

 
module HTML5
1
 
2
 
  class EOF < Exception; end
3
 
4
 
  def self._(str); str end
5
 
6
 
  CONTENT_MODEL_FLAGS = [
7
 
      :PCDATA,
8
 
      :RCDATA,
9
 
      :CDATA,
10
 
      :PLAINTEXT
11
 
  ]
12
 
13
 
  SCOPING_ELEMENTS = %w[
14
 
      button
15
 
      caption
16
 
      html
17
 
      marquee
18
 
      object
19
 
      table
20
 
      td
21
 
      th
22
 
  ]
23
 
24
 
  FORMATTING_ELEMENTS = %w[
25
 
      a
26
 
      b
27
 
      big
28
 
      em
29
 
      font
30
 
      i
31
 
      nobr
32
 
      s
33
 
      small
34
 
      strike
35
 
      strong
36
 
      tt
37
 
      u
38
 
  ]
39
 
40
 
  SPECIAL_ELEMENTS = %w[
41
 
      address
42
 
      area
43
 
      base
44
 
      basefont
45
 
      bgsound
46
 
      blockquote
47
 
      body
48
 
      br
49
 
      center
50
 
      col
51
 
      colgroup
52
 
      dd
53
 
      dir
54
 
      div
55
 
      dl
56
 
      dt
57
 
      embed
58
 
      fieldset
59
 
      form
60
 
      frame
61
 
      frameset
62
 
      h1
63
 
      h2
64
 
      h3
65
 
      h4
66
 
      h5
67
 
      h6
68
 
      head
69
 
      hr
70
 
      iframe
71
 
      image
72
 
      img
73
 
      input
74
 
      isindex
75
 
      li
76
 
      link
77
 
      listing
78
 
      menu
79
 
      meta
80
 
      noembed
81
 
      noframes
82
 
      noscript
83
 
      ol
84
 
      optgroup
85
 
      option
86
 
      p
87
 
      param
88
 
      plaintext
89
 
      pre
90
 
      script
91
 
      select
92
 
      spacer
93
 
      style
94
 
      tbody
95
 
      textarea
96
 
      tfoot
97
 
      thead
98
 
      title
99
 
      tr
100
 
      ul
101
 
      wbr
102
 
  ]
103
 
104
 
  SPACE_CHARACTERS = %W[
105
 
      \t
106
 
      \n
107
 
      \x0B
108
 
      \x0C
109
 
      \x20
110
 
      \r
111
 
  ]
112
 
113
 
  TABLE_INSERT_MODE_ELEMENTS = %w[
114
 
      table
115
 
      tbody
116
 
      tfoot
117
 
      thead
118
 
      tr
119
 
  ]
120
 
121
 
  ASCII_LOWERCASE = ('a'..'z').to_a.join('')
122
 
  ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
123
 
  ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
124
 
  DIGITS = '0'..'9'
125
 
  HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
126
 
127
 
  # Heading elements need to be ordered 
128
 
  HEADING_ELEMENTS = %w[
129
 
      h1
130
 
      h2
131
 
      h3
132
 
      h4
133
 
      h5
134
 
      h6
135
 
  ]
136
 
137
 
  # XXX What about event-source and command?
138
 
  VOID_ELEMENTS = %w[
139
 
      base
140
 
      link
141
 
      meta
142
 
      hr
143
 
      br
144
 
      img
145
 
      embed
146
 
      param
147
 
      area
148
 
      col
149
 
      input
150
 
  ]
151
 
152
 
  CDATA_ELEMENTS = %w[title textarea]
153
 
154
 
  RCDATA_ELEMENTS = %w[
155
 
    style
156
 
    script
157
 
    xmp
158
 
    iframe
159
 
    noembed
160
 
    noframes
161
 
    noscript
162
 
  ]
163
 
164
 
  BOOLEAN_ATTRIBUTES = {
165
 
    :global    => %w[irrelevant],
166
 
    'style'    => %w[scoped],
167
 
    'img'      => %w[ismap],
168
 
    'audio'    => %w[autoplay controls],
169
 
    'video'    => %w[autoplay controls],
170
 
    'script'   => %w[defer async],
171
 
    'details'  => %w[open],
172
 
    'datagrid' => %w[multiple disabled],
173
 
    'command'  => %w[hidden disabled checked default],
174
 
    'menu'     => %w[autosubmit],
175
 
    'fieldset' => %w[disabled readonly],
176
 
    'option'   => %w[disabled readonly selected],
177
 
    'optgroup' => %w[disabled readonly],
178
 
    'button'   => %w[disabled autofocus],
179
 
    'input'    => %w[disabled readonly required autofocus checked ismap],
180
 
    'select'   => %w[disabled readonly autofocus multiple],
181
 
    'output'   => %w[disabled readonly]
182
 
183
 
  }
184
 
185
 
  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
186
 
  ENTITIES_WINDOWS1252 = [
187
 
      8364,  # 0x80  0x20AC  EURO SIGN
188
 
      65533, # 0x81          UNDEFINED
189
 
      8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
190
 
      402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
191
 
      8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
192
 
      8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
193
 
      8224,  # 0x86  0x2020  DAGGER
194
 
      8225,  # 0x87  0x2021  DOUBLE DAGGER
195
 
      710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
196
 
      8240,  # 0x89  0x2030  PER MILLE SIGN
197
 
      352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
198
 
      8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
199
 
      338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
200
 
      65533, # 0x8D          UNDEFINED
201
 
      381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
202
 
      65533, # 0x8F          UNDEFINED
203
 
      65533, # 0x90          UNDEFINED
204
 
      8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
205
 
      8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
206
 
      8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
207
 
      8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
208
 
      8226,  # 0x95  0x2022  BULLET
209
 
      8211,  # 0x96  0x2013  EN DASH
210
 
      8212,  # 0x97  0x2014  EM DASH
211
 
      732,   # 0x98  0x02DC  SMALL TILDE
212
 
      8482,  # 0x99  0x2122  TRADE MARK SIGN
213
 
      353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
214
 
      8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
215
 
      339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
216
 
      65533, # 0x9D          UNDEFINED
217
 
      382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
218
 
      376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
219
 
  ]
220
 
221
 
  # ENTITIES was generated from Python using the following code:
222
 
  #
223
 
  # import constants
224
 
  # entities = constants.entities.items()
225
 
  # entities.sort()
226
 
  # list = [ ' '.join([repr(entity), '=>', ord(value)<128 and 
227
 
  #   repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
228
 
  #   for entity, value in entities]
229
 
  #   print '  ENTITIES = {\n    ' + ',\n    '.join(list) + '\n  }'
230
 
231
 
  ENTITIES = {
232
 
    'AElig'     => "\xc3\x86",
233
 
    'AElig;'    => "\xc3\x86",
234
 
    'AMP'       => '&',
235
 
    'AMP;'      => '&',
236
 
    'Aacute'    => "\xc3\x81",
237
 
    'Aacute;'   => "\xc3\x81",
238
 
    'Acirc'     => "\xc3\x82",
239
 
    'Acirc;'    => "\xc3\x82",
240
 
    'Agrave'    => "\xc3\x80",
241
 
    'Agrave;'   => "\xc3\x80",
242
 
    'Alpha;'    => "\xce\x91",
243
 
    'Aring'     => "\xc3\x85",
244
 
    'Aring;'    => "\xc3\x85",
245
 
    'Atilde'    => "\xc3\x83",
246
 
    'Atilde;'   => "\xc3\x83",
247
 
    'Auml'      => "\xc3\x84",
248
 
    'Auml;'     => "\xc3\x84",
249
 
    'Beta;'     => "\xce\x92",
250
 
    'COPY'      => "\xc2\xa9",
251
 
    'COPY;'     => "\xc2\xa9",
252
 
    'Ccedil'    => "\xc3\x87",
253
 
    'Ccedil;'   => "\xc3\x87",
254
 
    'Chi;'      => "\xce\xa7",
255
 
    'Dagger;'   => "\xe2\x80\xa1",
256
 
    'Delta;'    => "\xce\x94",
257
 
    'ETH'       => "\xc3\x90",
258
 
    'ETH;'      => "\xc3\x90",
259
 
    'Eacute'    => "\xc3\x89",
260
 
    'Eacute;'   => "\xc3\x89",
261
 
    'Ecirc'     => "\xc3\x8a",
262
 
    'Ecirc;'    => "\xc3\x8a",
263
 
    'Egrave'    => "\xc3\x88",
264
 
    'Egrave;'   => "\xc3\x88",
265
 
    'Epsilon;'  => "\xce\x95",
266
 
    'Eta;'      => "\xce\x97",
267
 
    'Euml'      => "\xc3\x8b",
268
 
    'Euml;'     => "\xc3\x8b",
269
 
    'GT'        => '>',
270
 
    'GT;'       => '>',
271
 
    'Gamma;'    => "\xce\x93",
272
 
    'Iacute'    => "\xc3\x8d",
273
 
    'Iacute;'   => "\xc3\x8d",
274
 
    'Icirc'     => "\xc3\x8e",
275
 
    'Icirc;'    => "\xc3\x8e",
276
 
    'Igrave'    => "\xc3\x8c",
277
 
    'Igrave;'   => "\xc3\x8c",
278
 
    'Iota;'     => "\xce\x99",
279
 
    'Iuml'      => "\xc3\x8f",
280
 
    'Iuml;'     => "\xc3\x8f",
281
 
    'Kappa;'    => "\xce\x9a",
282
 
    'LT'        => '<',
283
 
    'LT;'       => '<',
284
 
    'Lambda;'   => "\xce\x9b",
285
 
    'Mu;'       => "\xce\x9c",
286
 
    'Ntilde'    => "\xc3\x91",
287
 
    'Ntilde;'   => "\xc3\x91",
288
 
    'Nu;'       => "\xce\x9d",
289
 
    'OElig;'    => "\xc5\x92",
290
 
    'Oacute'    => "\xc3\x93",
291
 
    'Oacute;'   => "\xc3\x93",
292
 
    'Ocirc'     => "\xc3\x94",
293
 
    'Ocirc;'    => "\xc3\x94",
294
 
    'Ograve'    => "\xc3\x92",
295
 
    'Ograve;'   => "\xc3\x92",
296
 
    'Omega;'    => "\xce\xa9",
297
 
    'Omicron;'  => "\xce\x9f",
298
 
    'Oslash'    => "\xc3\x98",
299
 
    'Oslash;'   => "\xc3\x98",
300
 
    'Otilde'    => "\xc3\x95",
301
 
    'Otilde;'   => "\xc3\x95",
302
 
    'Ouml'      => "\xc3\x96",
303
 
    'Ouml;'     => "\xc3\x96",
304
 
    'Phi;'      => "\xce\xa6",
305
 
    'Pi;'       => "\xce\xa0",
306
 
    'Prime;'    => "\xe2\x80\xb3",
307
 
    'Psi;'      => "\xce\xa8",
308
 
    'QUOT'      => '"',
309
 
    'QUOT;'     => '"',
310
 
    'REG'       => "\xc2\xae",
311
 
    'REG;'      => "\xc2\xae",
312
 
    'Rho;'      => "\xce\xa1",
313
 
    'Scaron;'   => "\xc5\xa0",
314
 
    'Sigma;'    => "\xce\xa3",
315
 
    'THORN'     => "\xc3\x9e",
316
 
    'THORN;'    => "\xc3\x9e",
317
 
    'TRADE;'    => "\xe2\x84\xa2",
318
 
    'Tau;'      => "\xce\xa4",
319
 
    'Theta;'    => "\xce\x98",
320
 
    'Uacute'    => "\xc3\x9a",
321
 
    'Uacute;'   => "\xc3\x9a",
322
 
    'Ucirc'     => "\xc3\x9b",
323
 
    'Ucirc;'    => "\xc3\x9b",
324
 
    'Ugrave'    => "\xc3\x99",
325
 
    'Ugrave;'   => "\xc3\x99",
326
 
    'Upsilon;'  => "\xce\xa5",
327
 
    'Uuml'      => "\xc3\x9c",
328
 
    'Uuml;'     => "\xc3\x9c",
329
 
    'Xi;'       => "\xce\x9e",
330
 
    'Yacute'    => "\xc3\x9d",
331
 
    'Yacute;'   => "\xc3\x9d",
332
 
    'Yuml;'     => "\xc5\xb8",
333
 
    'Zeta;'     => "\xce\x96",
334
 
    'aacute'    => "\xc3\xa1",
335
 
    'aacute;'   => "\xc3\xa1",
336
 
    'acirc'     => "\xc3\xa2",
337
 
    'acirc;'    => "\xc3\xa2",
338
 
    'acute'     => "\xc2\xb4",
339
 
    'acute;'    => "\xc2\xb4",
340
 
    'aelig'     => "\xc3\xa6",
341
 
    'aelig;'    => "\xc3\xa6",
342
 
    'agrave'    => "\xc3\xa0",
343
 
    'agrave;'   => "\xc3\xa0",
344
 
    'alefsym;'  => "\xe2\x84\xb5",
345
 
    'alpha;'    => "\xce\xb1",
346
 
    'amp'       => '&',
347
 
    'amp;'      => '&',
348
 
    'and;'      => "\xe2\x88\xa7",
349
 
    'ang;'      => "\xe2\x88\xa0",
350
 
    'apos;'     => "'",
351
 
    'aring'     => "\xc3\xa5",
352
 
    'aring;'    => "\xc3\xa5",
353
 
    'asymp;'    => "\xe2\x89\x88",
354
 
    'atilde'    => "\xc3\xa3",
355
 
    'atilde;'   => "\xc3\xa3",
356
 
    'auml'      => "\xc3\xa4",
357
 
    'auml;'     => "\xc3\xa4",
358
 
    'bdquo;'    => "\xe2\x80\x9e",
359
 
    'beta;'     => "\xce\xb2",
360
 
    'brvbar'    => "\xc2\xa6",
361
 
    'brvbar;'   => "\xc2\xa6",
362
 
    'bull;'     => "\xe2\x80\xa2",
363
 
    'cap;'      => "\xe2\x88\xa9",
364
 
    'ccedil'    => "\xc3\xa7",
365
 
    'ccedil;'   => "\xc3\xa7",
366
 
    'cedil'     => "\xc2\xb8",
367
 
    'cedil;'    => "\xc2\xb8",
368
 
    'cent'      => "\xc2\xa2",
369
 
    'cent;'     => "\xc2\xa2",
370
 
    'chi;'      => "\xcf\x87",
371
 
    'circ;'     => "\xcb\x86",
372
 
    'clubs;'    => "\xe2\x99\xa3",
373
 
    'cong;'     => "\xe2\x89\x85",
374
 
    'copy'      => "\xc2\xa9",
375
 
    'copy;'     => "\xc2\xa9",
376
 
    'crarr;'    => "\xe2\x86\xb5",
377
 
    'cup;'      => "\xe2\x88\xaa",
378
 
    'curren'    => "\xc2\xa4",
379
 
    'curren;'   => "\xc2\xa4",
380
 
    'dArr;'     => "\xe2\x87\x93",
381
 
    'dagger;'   => "\xe2\x80\xa0",
382
 
    'darr;'     => "\xe2\x86\x93",
383
 
    'deg'       => "\xc2\xb0",
384
 
    'deg;'      => "\xc2\xb0",
385
 
    'delta;'    => "\xce\xb4",
386
 
    'diams;'    => "\xe2\x99\xa6",
387
 
    'divide'    => "\xc3\xb7",
388
 
    'divide;'   => "\xc3\xb7",
389
 
    'eacute'    => "\xc3\xa9",
390
 
    'eacute;'   => "\xc3\xa9",
391
 
    'ecirc'     => "\xc3\xaa",
392
 
    'ecirc;'    => "\xc3\xaa",
393
 
    'egrave'    => "\xc3\xa8",
394
 
    'egrave;'   => "\xc3\xa8",
395
 
    'empty;'    => "\xe2\x88\x85",
396
 
    'emsp;'     => "\xe2\x80\x83",
397
 
    'ensp;'     => "\xe2\x80\x82",
398
 
    'epsilon;'  => "\xce\xb5",
399
 
    'equiv;'    => "\xe2\x89\xa1",
400
 
    'eta;'      => "\xce\xb7",
401
 
    'eth'       => "\xc3\xb0",
402
 
    'eth;'      => "\xc3\xb0",
403
 
    'euml'      => "\xc3\xab",
404
 
    'euml;'     => "\xc3\xab",
405
 
    'euro;'     => "\xe2\x82\xac",
406
 
    'exist;'    => "\xe2\x88\x83",
407
 
    'fnof;'     => "\xc6\x92",
408
 
    'forall;'   => "\xe2\x88\x80",
409
 
    'frac12'    => "\xc2\xbd",
410
 
    'frac12;'   => "\xc2\xbd",
411
 
    'frac14'    => "\xc2\xbc",
412
 
    'frac14;'   => "\xc2\xbc",
413
 
    'frac34'    => "\xc2\xbe",
414
 
    'frac34;'   => "\xc2\xbe",
415
 
    'frasl;'    => "\xe2\x81\x84",
416
 
    'gamma;'    => "\xce\xb3",
417
 
    'ge;'       => "\xe2\x89\xa5",
418
 
    'gt'        => '>',
419
 
    'gt;'       => '>',
420
 
    'hArr;'     => "\xe2\x87\x94",
421
 
    'harr;'     => "\xe2\x86\x94",
422
 
    'hearts;'   => "\xe2\x99\xa5",
423
 
    'hellip;'   => "\xe2\x80\xa6",
424
 
    'iacute'    => "\xc3\xad",
425
 
    'iacute;'   => "\xc3\xad",
426
 
    'icirc'     => "\xc3\xae",
427
 
    'icirc;'    => "\xc3\xae",
428
 
    'iexcl'     => "\xc2\xa1",
429
 
    'iexcl;'    => "\xc2\xa1",
430
 
    'igrave'    => "\xc3\xac",
431
 
    'igrave;'   => "\xc3\xac",
432
 
    'image;'    => "\xe2\x84\x91",
433
 
    'infin;'    => "\xe2\x88\x9e",
434
 
    'int;'      => "\xe2\x88\xab",
435
 
    'iota;'     => "\xce\xb9",
436
 
    'iquest'    => "\xc2\xbf",
437
 
    'iquest;'   => "\xc2\xbf",
438
 
    'isin;'     => "\xe2\x88\x88",
439
 
    'iuml'      => "\xc3\xaf",
440
 
    'iuml;'     => "\xc3\xaf",
441
 
    'kappa;'    => "\xce\xba",
442
 
    'lArr;'     => "\xe2\x87\x90",
443
 
    'lambda;'   => "\xce\xbb",
444
 
    'lang;'     => "\xe3\x80\x88",
445
 
    'laquo'     => "\xc2\xab",
446
 
    'laquo;'    => "\xc2\xab",
447
 
    'larr;'     => "\xe2\x86\x90",
448
 
    'lceil;'    => "\xe2\x8c\x88",
449
 
    'ldquo;'    => "\xe2\x80\x9c",
450
 
    'le;'       => "\xe2\x89\xa4",
451
 
    'lfloor;'   => "\xe2\x8c\x8a",
452
 
    'lowast;'   => "\xe2\x88\x97",
453
 
    'loz;'      => "\xe2\x97\x8a",
454
 
    'lrm;'      => "\xe2\x80\x8e",
455
 
    'lsaquo;'   => "\xe2\x80\xb9",
456
 
    'lsquo;'    => "\xe2\x80\x98",
457
 
    'lt'        => '<',
458
 
    'lt;'       => '<',
459
 
    'macr'      => "\xc2\xaf",
460
 
    'macr;'     => "\xc2\xaf",
461
 
    'mdash;'    => "\xe2\x80\x94",
462
 
    'micro'     => "\xc2\xb5",
463
 
    'micro;'    => "\xc2\xb5",
464
 
    'middot'    => "\xc2\xb7",
465
 
    'middot;'   => "\xc2\xb7",
466
 
    'minus;'    => "\xe2\x88\x92",
467
 
    'mu;'       => "\xce\xbc",
468
 
    'nabla;'    => "\xe2\x88\x87",
469
 
    'nbsp'      => "\xc2\xa0",
470
 
    'nbsp;'     => "\xc2\xa0",
471
 
    'ndash;'    => "\xe2\x80\x93",
472
 
    'ne;'       => "\xe2\x89\xa0",
473
 
    'ni;'       => "\xe2\x88\x8b",
474
 
    'not'       => "\xc2\xac",
475
 
    'not;'      => "\xc2\xac",
476
 
    'notin;'    => "\xe2\x88\x89",
477
 
    'nsub;'     => "\xe2\x8a\x84",
478
 
    'ntilde'    => "\xc3\xb1",
479
 
    'ntilde;'   => "\xc3\xb1",
480
 
    'nu;'       => "\xce\xbd",
481
 
    'oacute'    => "\xc3\xb3",
482
 
    'oacute;'   => "\xc3\xb3",
483
 
    'ocirc'     => "\xc3\xb4",
484
 
    'ocirc;'    => "\xc3\xb4",
485
 
    'oelig;'    => "\xc5\x93",
486
 
    'ograve'    => "\xc3\xb2",
487
 
    'ograve;'   => "\xc3\xb2",
488
 
    'oline;'    => "\xe2\x80\xbe",
489
 
    'omega;'    => "\xcf\x89",
490
 
    'omicron;'  => "\xce\xbf",
491
 
    'oplus;'    => "\xe2\x8a\x95",
492
 
    'or;'       => "\xe2\x88\xa8",
493
 
    'ordf'      => "\xc2\xaa",
494
 
    'ordf;'     => "\xc2\xaa",
495
 
    'ordm'      => "\xc2\xba",
496
 
    'ordm;'     => "\xc2\xba",
497
 
    'oslash'    => "\xc3\xb8",
498
 
    'oslash;'   => "\xc3\xb8",
499
 
    'otilde'    => "\xc3\xb5",
500
 
    'otilde;'   => "\xc3\xb5",
501
 
    'otimes;'   => "\xe2\x8a\x97",
502
 
    'ouml'      => "\xc3\xb6",
503
 
    'ouml;'     => "\xc3\xb6",
504
 
    'para'      => "\xc2\xb6",
505
 
    'para;'     => "\xc2\xb6",
506
 
    'part;'     => "\xe2\x88\x82",
507
 
    'permil;'   => "\xe2\x80\xb0",
508
 
    'perp;'     => "\xe2\x8a\xa5",
509
 
    'phi;'      => "\xcf\x86",
510
 
    'pi;'       => "\xcf\x80",
511
 
    'piv;'      => "\xcf\x96",
512
 
    'plusmn'    => "\xc2\xb1",
513
 
    'plusmn;'   => "\xc2\xb1",
514
 
    'pound'     => "\xc2\xa3",
515
 
    'pound;'    => "\xc2\xa3",
516
 
    'prime;'    => "\xe2\x80\xb2",
517
 
    'prod;'     => "\xe2\x88\x8f",
518
 
    'prop;'     => "\xe2\x88\x9d",
519
 
    'psi;'      => "\xcf\x88",
520
 
    'quot'      => '"',
521
 
    'quot;'     => '"',
522
 
    'rArr;'     => "\xe2\x87\x92",
523
 
    'radic;'    => "\xe2\x88\x9a",
524
 
    'rang;'     => "\xe3\x80\x89",
525
 
    'raquo'     => "\xc2\xbb",
526
 
    'raquo;'    => "\xc2\xbb",
527
 
    'rarr;'     => "\xe2\x86\x92",
528
 
    'rceil;'    => "\xe2\x8c\x89",
529
 
    'rdquo;'    => "\xe2\x80\x9d",
530
 
    'real;'     => "\xe2\x84\x9c",
531
 
    'reg'       => "\xc2\xae",
532
 
    'reg;'      => "\xc2\xae",
533
 
    'rfloor;'   => "\xe2\x8c\x8b",
534
 
    'rho;'      => "\xcf\x81",
535
 
    'rlm;'      => "\xe2\x80\x8f",
536
 
    'rsaquo;'   => "\xe2\x80\xba",
537
 
    'rsquo;'    => "\xe2\x80\x99",
538
 
    'sbquo;'    => "\xe2\x80\x9a",
539
 
    'scaron;'   => "\xc5\xa1",
540
 
    'sdot;'     => "\xe2\x8b\x85",
541
 
    'sect'      => "\xc2\xa7",
542
 
    'sect;'     => "\xc2\xa7",
543
 
    'shy'       => "\xc2\xad",
544
 
    'shy;'      => "\xc2\xad",
545
 
    'sigma;'    => "\xcf\x83",
546
 
    'sigmaf;'   => "\xcf\x82",
547
 
    'sim;'      => "\xe2\x88\xbc",
548
 
    'spades;'   => "\xe2\x99\xa0",
549
 
    'sub;'      => "\xe2\x8a\x82",
550
 
    'sube;'     => "\xe2\x8a\x86",
551
 
    'sum;'      => "\xe2\x88\x91",
552
 
    'sup1'      => "\xc2\xb9",
553
 
    'sup1;'     => "\xc2\xb9",
554
 
    'sup2'      => "\xc2\xb2",
555
 
    'sup2;'     => "\xc2\xb2",
556
 
    'sup3'      => "\xc2\xb3",
557
 
    'sup3;'     => "\xc2\xb3",
558
 
    'sup;'      => "\xe2\x8a\x83",
559
 
    'supe;'     => "\xe2\x8a\x87",
560
 
    'szlig'     => "\xc3\x9f",
561
 
    'szlig;'    => "\xc3\x9f",
562
 
    'tau;'      => "\xcf\x84",
563
 
    'there4;'   => "\xe2\x88\xb4",
564
 
    'theta;'    => "\xce\xb8",
565
 
    'thetasym;' => "\xcf\x91",
566
 
    'thinsp;'   => "\xe2\x80\x89",
567
 
    'thorn'     => "\xc3\xbe",
568
 
    'thorn;'    => "\xc3\xbe",
569
 
    'tilde;'    => "\xcb\x9c",
570
 
    'times'     => "\xc3\x97",
571
 
    'times;'    => "\xc3\x97",
572
 
    'trade;'    => "\xe2\x84\xa2",
573
 
    'uArr;'     => "\xe2\x87\x91",
574
 
    'uacute'    => "\xc3\xba",
575
 
    'uacute;'   => "\xc3\xba",
576
 
    'uarr;'     => "\xe2\x86\x91",
577
 
    'ucirc'     => "\xc3\xbb",
578
 
    'ucirc;'    => "\xc3\xbb",
579
 
    'ugrave'    => "\xc3\xb9",
580
 
    'ugrave;'   => "\xc3\xb9",
581
 
    'uml'       => "\xc2\xa8",
582
 
    'uml;'      => "\xc2\xa8",
583
 
    'upsih;'    => "\xcf\x92",
584
 
    'upsilon;'  => "\xcf\x85",
585
 
    'uuml'      => "\xc3\xbc",
586
 
    'uuml;'     => "\xc3\xbc",
587
 
    'weierp;'   => "\xe2\x84\x98",
588
 
    'xi;'       => "\xce\xbe",
589
 
    'yacute'    => "\xc3\xbd",
590
 
    'yacute;'   => "\xc3\xbd",
591
 
    'yen'       => "\xc2\xa5",
592
 
    'yen;'      => "\xc2\xa5",
593
 
    'yuml'      => "\xc3\xbf",
594
 
    'yuml;'     => "\xc3\xbf",
595
 
    'zeta;'     => "\xce\xb6",
596
 
    'zwj;'      => "\xe2\x80\x8d",
597
 
    'zwnj;'     => "\xe2\x80\x8c"
598
 
  }
599
 
600
 
  ENCODINGS = %w[
601
 
      ansi_x3.4-1968
602
 
      iso-ir-6
603
 
      ansi_x3.4-1986
604
 
      iso_646.irv:1991
605
 
      ascii
606
 
      iso646-us
607
 
      us-ascii
608
 
      us
609
 
      ibm367
610
 
      cp367
611
 
      csascii
612
 
      ks_c_5601-1987
613
 
      korean
614
 
      iso-2022-kr
615
 
      csiso2022kr
616
 
      euc-kr
617
 
      iso-2022-jp
618
 
      csiso2022jp
619
 
      iso-2022-jp-2
620
 
      iso-ir-58
621
 
      chinese
622
 
      csiso58gb231280
623
 
      iso_8859-1:1987
624
 
      iso-ir-100
625
 
      iso_8859-1
626
 
      iso-8859-1
627
 
      latin1
628
 
      l1
629
 
      ibm819
630
 
      cp819
631
 
      csisolatin1
632
 
      iso_8859-2:1987
633
 
      iso-ir-101
634
 
      iso_8859-2
635
 
      iso-8859-2
636
 
      latin2
637
 
      l2
638
 
      csisolatin2
639
 
      iso_8859-3:1988
640
 
      iso-ir-109
641
 
      iso_8859-3
642
 
      iso-8859-3
643
 
      latin3
644
 
<