sgmllib.py源码

  1 """A parser for SGML, using the derived class as a static DTD."""
  2 
  3 # XXX This only supports those SGML features used by HTML.
  4 
  5 # XXX There should be a way to distinguish between PCDATA (parsed
  6 # character data -- the normal case), RCDATA (replaceable character
  7 # data -- only char and entity references and end tags are special)
  8 # and CDATA (character data -- only end tags are special).  RCDATA is
  9 # not supported at all.
 10 
 11 
 12 from warnings import warnpy3k
 13 warnpy3k("the sgmllib module has been removed in Python 3.0",
 14          stacklevel=2)
 15 del warnpy3k
 16 
 17 import markupbase
 18 import re
 19 
 20 __all__ = ["SGMLParser", "SGMLParseError"]
 21 
 22 # Regular expressions used for parsing
 23 
 24 interesting = re.compile('[&<]')
 25 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
 26                            '<([a-zA-Z][^<>]*|'
 27                               '/([a-zA-Z][^<>]*)?|'
 28                               '![^<>]*)?')
 29 
 30 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 31 charref = re.compile('&#([0-9]+)[^0-9]')
 32 
 33 starttagopen = re.compile('<[>a-zA-Z]')
 34 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 35 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 36 piclose = re.compile('>')
 37 endbracket = re.compile('[<>]')
 38 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 39 attrfind = re.compile(
 40     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
 41     r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
 42 
 43 
 44 class SGMLParseError(RuntimeError):
 45     """Exception raised for all parse errors."""
 46     pass
 47 
 48 
 49 # SGML parser base class -- find tags and call handler functions.
 50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
 51 # The dtd is defined by deriving a class which defines methods
 52 # with special names to handle tags: start_foo and end_foo to handle
 53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
 54 # (Tags are converted to lower case for this purpose.)  The data
 55 # between tags is passed to the parser by calling self.handle_data()
 56 # with some data as argument (the data may be split up in arbitrary
 57 # chunks).  Entity references are passed by calling
 58 # self.handle_entityref() with the entity reference as argument.
 59 
 60 class SGMLParser(markupbase.ParserBase):
 61     # Definition of entities -- derived classes may override
 62     entity_or_charref = re.compile('&(?:'
 63       '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
 64       ')(;?)')
 65 
 66     def __init__(self, verbose=0):
 67         """Initialize and reset this instance."""
 68         self.verbose = verbose
 69         self.reset()
 70 
 71     def reset(self):
 72         """Reset this instance. Loses all unprocessed data."""
 73         self.__starttag_text = None
 74         self.rawdata = ''
 75         self.stack = []
 76         self.lasttag = '???'
 77         self.nomoretags = 0
 78         self.literal = 0
 79         markupbase.ParserBase.reset(self)
 80 
 81     def setnomoretags(self):
 82         """Enter literal mode (CDATA) till EOF.
 83 
 84         Intended for derived classes only.
 85         """
 86         self.nomoretags = self.literal = 1
 87 
 88     def setliteral(self, *args):
 89         """Enter literal mode (CDATA).
 90 
 91         Intended for derived classes only.
 92         """
 93         self.literal = 1
 94 
 95     def feed(self, data):
 96         """Feed some data to the parser.
 97 
 98         Call this as often as you want, with as little or as much text
 99         as you want (may include '\n').  (This just saves the text,
100         all the processing is done by goahead().)
101         """
102 
103         self.rawdata = self.rawdata + data
104         self.goahead(0)
105 
106     def close(self):
107         """Handle the remaining data."""
108         self.goahead(1)
109 
110     def error(self, message):
111         raise SGMLParseError(message)
112 
113     # Internal -- handle data as far as reasonable.  May leave state
114     # and data to be processed by a subsequent call.  If 'end' is
115     # true, force handling all data as if followed by EOF marker.
116     def goahead(self, end):
117         rawdata = self.rawdata
118         i = 0
119         n = len(rawdata)
120         while i < n:
121             if self.nomoretags:
122                 self.handle_data(rawdata[i:n])
123                 i = n
124                 break
125             match = interesting.search(rawdata, i)
126             if match: j = match.start()
127             else: j = n
128             if i < j:
129                 self.handle_data(rawdata[i:j])
130             i = j
131             if i == n: break
132             if rawdata[i] == '<':
133                 if starttagopen.match(rawdata, i):
134                     if self.literal:
135                         self.handle_data(rawdata[i])
136                         i = i+1
137                         continue
138                     k = self.parse_starttag(i)
139                     if k < 0: break
140                     i = k
141                     continue
142                 if rawdata.startswith("</", i):
143                     k = self.parse_endtag(i)
144                     if k < 0: break
145                     i = k
146                     self.literal = 0
147                     continue
148                 if self.literal:
149                     if n > (i + 1):
150                         self.handle_data("<")
151                         i = i+1
152                     else:
153                         # incomplete
154                         break
155                     continue
156                 if rawdata.startswith("<!--", i):
157                         # Strictly speaking, a comment is --.*--
158                         # within a declaration tag <!...>.
159                         # This should be removed,
160                         # and comments handled only in parse_declaration.
161                     k = self.parse_comment(i)
162                     if k < 0: break
163                     i = k
164                     continue
165                 if rawdata.startswith("<?", i):
166                     k = self.parse_pi(i)
167                     if k < 0: break
168                     i = i+k
169                     continue
170                 if rawdata.startswith("<!", i):
171                     # This is some sort of declaration; in "HTML as
172                     # deployed," this should only be the document type
173                     # declaration ("<!DOCTYPE html...>").
174                     k = self.parse_declaration(i)
175                     if k < 0: break
176                     i = k
177                     continue
178             elif rawdata[i] == '&':
179                 if self.literal:
180                     self.handle_data(rawdata[i])
181                     i = i+1
182                     continue
183                 match = charref.match(rawdata, i)
184                 if match:
185                     name = match.group(1)
186                     self.handle_charref(name)
187                     i = match.end(0)
188                     if rawdata[i-1] != ';': i = i-1
189                     continue
190                 match = entityref.match(rawdata, i)
191                 if match:
192                     name = match.group(1)
193                     self.handle_entityref(name)
194                     i = match.end(0)
195                     if rawdata[i-1] != ';': i = i-1
196                     continue
197             else:
198                 self.error('neither < nor & ??')
199             # We get here only if incomplete matches but
200             # nothing else
201             match = incomplete.match(rawdata, i)
202             if not match:
203                 self.handle_data(rawdata[i])
204                 i = i+1
205                 continue
206             j = match.end(0)
207             if j == n:
208                 break # Really incomplete
209             self.handle_data(rawdata[i:j])
210             i = j
211         # end while
212         if end and i < n:
213             self.handle_data(rawdata[i:n])
214             i = n
215         self.rawdata = rawdata[i:]
216         # XXX if end: check for empty stack
217 
218     # Extensions for the DOCTYPE scanner:
219     _decl_otherchars = '='
220 
221     # Internal -- parse processing instr, return length or -1 if not terminated
222     def parse_pi(self, i):
223         rawdata = self.rawdata
224         if rawdata[i:i+2] != '<?':
225             self.error('unexpected call to parse_pi()')
226         match = piclose.search(rawdata, i+2)
227         if not match:
228             return -1
229         j = match.start(0)
230         self.handle_pi(rawdata[i+2: j])
231         j = match.end(0)
232         return j-i
233 
234     def get_starttag_text(self):
235         return self.__starttag_text
236 
237     # Internal -- handle starttag, return length or -1 if not terminated
238     def parse_starttag(self, i):
239         self.__starttag_text = None
240         start_pos = i
241         rawdata = self.rawdata
242         if shorttagopen.match(rawdata, i):
243             # SGML shorthand: <tag/data/ == <tag>data</tag>
244             # XXX Can data contain &... (entity or char refs)?
245             # XXX Can data contain < or > (tag characters)?
246             # XXX Can there be whitespace before the first /?
247             match = shorttag.match(rawdata, i)
248             if not match:
249                 return -1
250             tag, data = match.group(1, 2)
251             self.__starttag_text = '<%s/' % tag
252             tag = tag.lower()
253             k = match.end(0)
254             self.finish_shorttag(tag, data)
255             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
256             return k
257         # XXX The following should skip matching quotes (' or ")
258         # As a shortcut way to exit, this isn't so bad, but shouldn't
259         # be used to locate the actual end of the start tag since the
260         # < or > characters may be embedded in an attribute value.
261         match = endbracket.search(rawdata, i+1)
262         if not match:
263             return -1
264         j = match.start(0)
265         # Now parse the data between i+1 and j into a tag and attrs
266         attrs = []
267         if rawdata[i:i+2] == '<>':
268             # SGML shorthand: <> == <last open tag seen>
269             k = j
270             tag = self.lasttag
271         else:
272             match = tagfind.match(rawdata, i+1)
273             if not match:
274                 self.error('unexpected call to parse_starttag')
275             k = match.end(0)
276             tag = rawdata[i+1:k].lower()
277             self.lasttag = tag
278         while k < j:
279             match = attrfind.match(rawdata, k)
280             if not match: break
281             attrname, rest, attrvalue = match.group(1, 2, 3)
282             if not rest:
283                 attrvalue = attrname
284             else:
285                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
286                     attrvalue[:1] == '"' == attrvalue[-1:]):
287                     # strip quotes
288                     attrvalue = attrvalue[1:-1]
289                 attrvalue = self.entity_or_charref.sub(
290                     self._convert_ref, attrvalue)
291             attrs.append((attrname.lower(), attrvalue))
292             k = match.end(0)
293         if rawdata[j] == '>':
294             j = j+1
295         self.__starttag_text = rawdata[start_pos:j]
296         self.finish_starttag(tag, attrs)
297         return j
298 
299     # Internal -- convert entity or character reference
300     def _convert_ref(self, match):
301         if match.group(2):
302             return self.convert_charref(match.group(2)) or \
303                 '&#%s%s' % match.groups()[1:]
304         elif match.group(3):
305             return self.convert_entityref(match.group(1)) or \
306                 '&%s;' % match.group(1)
307         else:
308             return '&%s' % match.group(1)
309 
310     # Internal -- parse endtag
311     def parse_endtag(self, i):
312         rawdata = self.rawdata
313         match = endbracket.search(rawdata, i+1)
314         if not match:
315             return -1
316         j = match.start(0)
317         tag = rawdata[i+2:j].strip().lower()
318         if rawdata[j] == '>':
319             j = j+1
320         self.finish_endtag(tag)
321         return j
322 
323     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
324     def finish_shorttag(self, tag, data):
325         self.finish_starttag(tag, [])
326         self.handle_data(data)
327         self.finish_endtag(tag)
328 
329     # Internal -- finish processing of start tag
330     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
331     def finish_starttag(self, tag, attrs):
332         try:
333             method = getattr(self, 'start_' + tag)
334         except AttributeError:
335             try:
336                 method = getattr(self, 'do_' + tag)
337             except AttributeError:
338                 self.unknown_starttag(tag, attrs)
339                 return -1
340             else:
341                 self.handle_starttag(tag, method, attrs)
342                 return 0
343         else:
344             self.stack.append(tag)
345             self.handle_starttag(tag, method, attrs)
346             return 1
347 
348     # Internal -- finish processing of end tag
349     def finish_endtag(self, tag):
350         if not tag:
351             found = len(self.stack) - 1
352             if found < 0:
353                 self.unknown_endtag(tag)
354                 return
355         else:
356             if tag not in self.stack:
357                 try:
358                     method = getattr(self, 'end_' + tag)
359                 except AttributeError:
360                     self.unknown_endtag(tag)
361                 else:
362                     self.report_unbalanced(tag)
363                 return
364             found = len(self.stack)
365             for i in range(found):
366                 if self.stack[i] == tag: found = i
367         while len(self.stack) > found:
368             tag = self.stack[-1]
369             try:
370                 method = getattr(self, 'end_' + tag)
371             except AttributeError:
372                 method = None
373             if method:
374                 self.handle_endtag(tag, method)
375             else:
376                 self.unknown_endtag(tag)
377             del self.stack[-1]
378 
379     # Overridable -- handle start tag
380     def handle_starttag(self, tag, method, attrs):
381         method(attrs)
382 
383     # Overridable -- handle end tag
384     def handle_endtag(self, tag, method):
385         method()
386 
387     # Example -- report an unbalanced </...> tag.
388     def report_unbalanced(self, tag):
389         if self.verbose:
390             print '*** Unbalanced </' + tag + '>'
391             print '*** Stack:', self.stack
392 
393     def convert_charref(self, name):
394         """Convert character reference, may be overridden."""
395         try:
396             n = int(name)
397         except ValueError:
398             return
399         if not 0 <= n <= 127:
400             return
401         return self.convert_codepoint(n)
402 
403     def convert_codepoint(self, codepoint):
404         return chr(codepoint)
405 
406     def handle_charref(self, name):
407         """Handle character reference, no need to override."""
408         replacement = self.convert_charref(name)
409         if replacement is None:
410             self.unknown_charref(name)
411         else:
412             self.handle_data(replacement)
413 
414     # Definition of entities -- derived classes may override
415     entitydefs = \
416             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
417 
418     def convert_entityref(self, name):
419         """Convert entity references.
420 
421         As an alternative to overriding this method; one can tailor the
422         results by setting up the self.entitydefs mapping appropriately.
423         """
424         table = self.entitydefs
425         if name in table:
426             return table[name]
427         else:
428             return
429 
430     def handle_entityref(self, name):
431         """Handle entity references, no need to override."""
432         replacement = self.convert_entityref(name)
433         if replacement is None:
434             self.unknown_entityref(name)
435         else:
436             self.handle_data(replacement)
437 
438     # Example -- handle data, should be overridden
439     def handle_data(self, data):
440         pass
441 
442     # Example -- handle comment, could be overridden
443     def handle_comment(self, data):
444         pass
445 
446     # Example -- handle declaration, could be overridden
447     def handle_decl(self, decl):
448         pass
449 
450     # Example -- handle processing instruction, could be overridden
451     def handle_pi(self, data):
452         pass
453 
454     # To be overridden -- handlers for unknown objects
455     def unknown_starttag(self, tag, attrs): pass
456     def unknown_endtag(self, tag): pass
457     def unknown_charref(self, ref): pass
458     def unknown_entityref(self, ref): pass
459 
460 
class TestSGMLParser(SGMLParser):
    """Parser that prints each event; text is buffered in self.testdata."""

    def __init__(self, verbose=0):
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Append *data* to the buffer; flush once its repr reaches 70 chars."""
        self.testdata += data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print the buffered text, if any, and empty the buffer."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print(' '.join(('data:', repr(pending))))

    def handle_comment(self, data):
        """Print the comment's repr, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            shown = shown[:32] + '...' + shown[-32:]
        print(' '.join(('comment:', shown)))

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # One space between the pieces, as the original trailing-comma
        # print statements produced.
        pieces = ['start tag: <' + tag]
        for name, value in attrs:
            pieces.append(name + '=' + '"' + value + '"')
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever text is still buffered."""
        SGMLParser.close(self)
        self.flush()
514 
515 
def test(args=None):
    """Parse a file and print the parse events.

    args -- command-line style argument list; defaults to sys.argv[1:].
            A leading '-s' selects the silent SGMLParser instead of
            TestSGMLParser.  The next argument is the file to read
            ('-' for standard input); 'test.html' is used when absent.

    Exits with status 1 when the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print(' '.join((filename, ":", str(msg))))
            sys.exit(1)
        # Close the file even when read() fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feed one character at a time so the incremental-parsing paths in
    # goahead() are exercised.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()
你给了我眼睛，却不给我光明。
相关阅读:
为开源项目 go-gin-api 增加后台任务模块
 将多行数据以',' 进行分隔
 syslog中的“（CRON）信息（未安装MTA，丢弃输出）”错误，crontab定时任务失效
 为什么我不推荐大家去外包公司？
Linux永久修改系统时间
 云数据库 Redis 暂时不支持外网访问
 Nginx中worker connections问题的解决方法大量用户502
入手
 nginx 之$proxy_host｜$host｜$http_host区别
 grpc-golang入门
原文地址：https://www.cnblogs.com/wwb0111/p/3113659.html