1 """
2 An interface to html5lib.
3 """
4
5 import urllib
6 from html5lib import HTMLParser as _HTMLParser
7 from lxml import etree
8 from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE
9 from lxml.html._html5builder import TreeBuilder
10
11
12 try:
13 _strings = basestring
14 except NameError:
15 _strings = (bytes, str)
16
17
19 """An html5lib HTML parser with lxml as tree."""
20
23
24
25 try:
26 from html5lib import XHTMLParser as _XHTMLParser
27 except ImportError:
28 pass
29 else:
31 """An html5lib XHTML Parser with lxml as tree."""
32
35
36 xhtml_parser = XHTMLParser()
37
38
44
45
55
56
59 """Parses several HTML elements, returning a list of elements.
60
61 The first item in the list may be a string. If no_leading_text is true,
62 then it will be an error if there is leading text, and it will always be
63 a list of only elements.
64
65 If `guess_charset` is `True` and the text was not unicode but a
66 bytestring, the `chardet` library will perform charset guessing on the
67 string.
68 """
69 if not isinstance(html, _strings):
70 raise TypeError('string required')
71
72 if parser is None:
73 parser = html_parser
74
75 children = parser.parseFragment(html, 'div', useChardet=guess_charset)
76 if children and isinstance(children[0], _strings):
77 if no_leading_text:
78 if children[0].strip():
79 raise etree.ParserError('There is leading text: %r' %
80 children[0])
81 del children[0]
82 return children
83
84
87 """Parses a single HTML element; it is an error if there is more than
88 one element, or if anything but whitespace precedes or follows the
89 element.
90
91 If create_parent is true (or is a tag name) then a parent node
92 will be created to encapsulate the HTML in a single element. In
93 this case, leading or trailing text is allowed.
94 """
95 if not isinstance(html, _strings):
96 raise TypeError('string required')
97
98 accept_leading_text = bool(create_parent)
99
100 elements = fragments_fromstring(
101 html, guess_charset=guess_charset, parser=parser,
102 no_leading_text=not accept_leading_text, **kw)
103
104 if create_parent:
105 if not isinstance(create_parent, basestring):
106 create_parent = 'div'
107 new_root = Element(create_parent)
108 if elements:
109 if isinstance(elements[0], basestring):
110 new_root.text = elements[0]
111 del elements[0]
112 new_root.extend(elements)
113 return new_root
114
115 if not elements:
116 raise etree.ParserError('No elements found')
117 if len(elements) > 1:
118 raise etree.ParserError('Multiple elements found')
119 result = elements[0]
120 if result.tail and result.tail.strip():
121 raise etree.ParserError('Element followed by text: %r' % result.tail)
122 result.tail = None
123 return result
124
125
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    Arguments:
      html -- the markup to parse; must be an instance of ``_strings``
          (text or byte string).
      guess_charset -- passed through to ``document_fromstring``.
      parser -- passed through to ``document_fromstring``.

    Returns the result of ``document_fromstring`` unchanged when the input
    starts with ``<html`` or a doctype, or when the parsed ``head`` has
    children.  Otherwise returns the single child of ``body`` (when there
    is exactly one and no surrounding non-whitespace text), or ``body``
    itself renamed to ``div`` or ``span``.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Sniff the first characters of the input: a leading <html> tag or a
    # doctype means the caller gave us a full document.
    start = html[:50]
    if isinstance(start, bytes) and not isinstance(start, str):
        # Python 3 byte string: bytes.startswith() rejects str prefixes, so
        # decode first.  ASCII with replacement is enough for the markers
        # we look for and never fails on non-ASCII payload bytes.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # Anything inside <head> means the input carried document-level content.
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # Exactly one element with only whitespace around it: hand back that
    # element rather than a wrapper.
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Several elements and/or loose text: reuse <body> as the container,
    # renamed according to whether it holds block-level content.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
166
167
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    Arguments:
      filename_url_or_file -- a string naming a file or URL, or an object
          that is handed to the parser as-is.
      guess_charset -- forwarded to the parser as ``useChardet``.
      parser -- parser to use; defaults to the module-level ``html_parser``.

    A handle opened here (for a filename or URL) is closed before
    returning; an object passed in by the caller is left open.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # Not a string: treat it as an already-open source owned by the
        # caller, so it is not closed here.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    # ``urlopen`` lives in ``urllib`` on Python 2 and in ``urllib.request``
    # on Python 3.
    try:
        from urllib import urlopen
    except ImportError:
        from urllib.request import urlopen

    try:
        fp = urlopen(filename_url_or_file)
    except ValueError:
        # Python 3's urlopen raises ValueError for names without a URL
        # scheme; treat those as local file paths.
        fp = open(filename_url_or_file, 'rb')

    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        fp.close()
180
181
# Shared default parser instance; the parsing helpers in this module fall
# back to it whenever they are called with ``parser=None``.
html_parser = HTMLParser()
183