Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# http://code.activestate.com/recipes/65125-xml-lexing-shallow-parsing/
# by Paul Prescod
# licensed under the PSF License
#
# modified to capture all non-overlapping parts of tokens

6 

7import re 

8 

# Python 2 compatibility: when the ``unicode`` builtin exists, rebind the
# module-level name ``str`` to it so the rest of this module works with
# text strings on both major versions.  On Python 3 the lookup raises
# NameError and ``str`` is left untouched.
try:
    unicode
except NameError:
    pass
else:
    str = unicode

13 

class recollector:
    """Registry of named regular-expression fragments.

    Fragments are stored in ``self.res`` keyed by name.  A fragment may
    reference previously registered fragments with ``%(Name)s``
    placeholders; the placeholders are expanded when the fragment is added,
    so ``self.res`` always holds fully expanded pattern strings.
    """

    def __init__(self):
        # Maps fragment name -> fully expanded pattern string.
        self.res = {}

    def add(self, name, reg):
        """Expand ``reg`` against the known fragments and store it as ``name``.

        A literal percent sign in ``reg`` must be written as ``%%`` because
        the template goes through ``%``-formatting.

        Raises:
            KeyError: if ``reg`` references a fragment that was not added yet.
            re.error: if the expanded pattern is not a valid regular
                expression; in that case nothing is stored.
        """
        expanded = reg % self.res
        # Validate the *expanded* pattern.  Compiling the raw template would
        # treat "%(Name)s" as literal regex text and therefore miss errors
        # that only appear once the referenced fragments are substituted.
        re.compile(expanded)
        self.res[name] = expanded

21 

collector = recollector()
a = collector.add

# Fragment table for the shallow XML parser.  Each template may refer to
# fragments registered before it through ``%(Name)s`` placeholders, which are
# expanded at registration time -- so the order of the entries matters.
for _name, _template in (
    ("TextSE", r"[^<]+"),
    ("UntilHyphen", r"[^-]*-"),
    ("Until2Hyphens", r"%(UntilHyphen)s(?:[^-]%(UntilHyphen)s)*-"),
    ("CommentCE", r"%(Until2Hyphens)s>?"),
    ("UntilRSBs", r"[^\]]*](?:[^\]]+])*]+"),
    ("CDATA_CE", r"%(UntilRSBs)s(?:[^\]>]%(UntilRSBs)s)*>"),
    ("S", r"[ \n\t\r]+"),
    ("Simple", "[^\"'>/]+"),
    ("NameStrt", r"[A-Za-z_:@]|[^\x00-\x7F]"),
    ("NameChar", r"[A-Za-z0-9_:.-]|[^\x00-\x7F]"),
    ("Name", r"(?:%(NameStrt)s)(?:%(NameChar)s)*"),
    ("QuoteSE", "\"[^\"]*\"|'[^']*'"),
    ("DT_IdentSE", r"%(S)s%(Name)s(?:%(S)s(?:%(Name)s|%(QuoteSE)s))*"),
    ("MarkupDeclCE", "(?:[^\\]\"'><]+|%(QuoteSE)s)*>"),
    ("S1", r"[\n\r\t ]"),
    ("UntilQMs", r"[^?]*\?+"),
    ("PI_Tail", r"\?>|%(S1)s%(UntilQMs)s(?:[^>?]%(UntilQMs)s)*>"),
    ("DT_ItemSE",
     r"<(?:!(?:--%(Until2Hyphens)s>|[^-]%(MarkupDeclCE)s)|"
     r"\?%(Name)s(?:%(PI_Tail)s))|%%%(Name)s;|%(S)s"),
    ("DocTypeCE",
     r"%(DT_IdentSE)s(?:%(S)s)?(?:\[(?:%(DT_ItemSE)s)*](?:%(S)s)?)?>?"),
    ("DeclCE",
     r"--(?:%(CommentCE)s)?|\[CDATA\[(?:%(CDATA_CE)s)?|"
     r"DOCTYPE(?:%(DocTypeCE)s)?"),
    ("PI_CE", r"%(Name)s(?:%(PI_Tail)s)?"),
    ("EndTagCE", r"%(Name)s(?:%(S)s)?>?"),
    ("AttValSE", r"\"[^\"]*\"|'[^']*'|[^\s=<>`]+"),
    ("ElemTagCE",
     r"(%(Name)s)(?:(%(S)s)(%(Name)s)(((?:%(S)s)?=(?:%(S)s)?)"
     r"(?:%(AttValSE)s|%(Simple)s)|(?!(?:%(S)s)?=)))*(?:%(S)s)?(/?>)?"),
    ("MarkupSPE",
     r"<(?:!(?:%(DeclCE)s)?|"
     r"\?(?:%(PI_CE)s)?|/(?:%(EndTagCE)s)?|(?:%(ElemTagCE)s)?)"),
    ("XML_SPE", r"%(TextSE)s|%(MarkupSPE)s"),
    ("XML_MARKUP_ONLY_SPE", r"%(MarkupSPE)s"),
    ("ElemTagSPE", r"<|%(Name)s"),
):
    a(_name, _template)
del _name, _template

# Compiled entry points: text-or-markup tokens, and markup tokens only.
re_xml_spe = re.compile(collector.res["XML_SPE"])
re_markup_only_spe = re.compile(collector.res["XML_MARKUP_ONLY_SPE"])

66 

67 

def iter_xml(body, filename=None):
    """Yield a ``Token`` for every ``re_xml_spe`` match found in ``body``.

    Each token carries the matched text, the offset at which the match
    starts in ``body``, ``body`` itself as its source, and ``filename``.
    """
    for found in re_xml_spe.finditer(body):
        yield Token(found.group(), found.start(), body, filename)

73 

74 

def iter_text(body, filename=None):
    """Yield ``body`` as one single ``Token`` positioned at offset 0."""
    whole = Token(body, 0, body, filename)
    yield whole

77 

78 

class Token(str):
    """A string that remembers where it came from.

    Besides the text itself a token stores:

    * ``pos`` -- offset of the token's first character within ``source``,
    * ``source`` -- the complete text the token was cut from (or ``None``),
    * ``filename`` -- name of the originating file ("" when not given).

    The overridden string operations return new tokens whose ``pos`` keeps
    pointing at the matching place in ``source``.
    """

    __slots__ = "pos", "source", "filename"

    def __new__(cls, string, pos=0, source=None, filename=None):
        inst = str.__new__(cls, string)
        inst.pos = pos
        inst.source = source
        inst.filename = filename or ""
        return inst

    def __getslice__(self, i, j):
        # Only invoked by Python 2 for simple ``tok[i:j]`` slices; Python 3
        # routes every slice through ``__getitem__``.
        piece = str.__getslice__(self, i, j)
        return Token(piece, self.pos + i, self.source, self.filename)

    def __getitem__(self, index):
        """Return a ``Token`` for slices and a plain string for one index."""
        s = str.__getitem__(self, index)
        if isinstance(index, slice):
            # Normalise the start so that negative or out-of-range values
            # (``tok[-3:]``) translate to the real offset inside this token
            # instead of being added to ``pos`` verbatim.
            start = index.indices(len(self))[0]
            return Token(s, self.pos + start, self.source, self.filename)
        return s

    def __add__(self, other):
        """Concatenate, keeping this token's position; ``None`` is a no-op."""
        if other is None:
            return self

        return Token(
            str.__add__(self, other), self.pos, self.source, self.filename)

    def __eq__(self, other):
        return str.__eq__(self, other)

    def __hash__(self):
        return str.__hash__(self)

    def replace(self, *args):
        """Like ``str.replace`` but the result keeps this token's origin."""
        s = str.replace(self, *args)
        return Token(s, self.pos, self.source, self.filename)

    def split(self, *args):
        """Like ``str.split`` but return tokens positioned at their origin.

        Every piece gets the source offset at which it actually starts, i.e.
        the separators (or whitespace runs) between the pieces are counted.
        """
        pieces = str.split(self, *args)
        sep = args[0] if args else None
        sep_len = 0 if sep is None else len(sep)

        tokens = []
        offset = 0
        for piece in pieces:
            if sep is None:
                # Whitespace splitting: runs of arbitrary length separate
                # the pieces, so locate each piece after the previous one.
                offset = str.find(self, piece, offset)
            tokens.append(
                Token(piece, self.pos + offset, self.source, self.filename))
            offset += len(piece) + sep_len
        return tokens

    def strip(self, *args):
        """Strip both ends; the position follows the left-hand strip."""
        return self.lstrip(*args).rstrip(*args)

    def lstrip(self, *args):
        """Strip the left end and advance ``pos`` past what was removed."""
        s = str.lstrip(self, *args)
        return Token(
            s, self.pos + len(self) - len(s), self.source, self.filename)

    def rstrip(self, *args):
        """Strip the right end; the position is unchanged."""
        s = str.rstrip(self, *args)
        return Token(s, self.pos, self.source, self.filename)

    @property
    def location(self):
        """Return ``(line, column)`` of the token within ``source``.

        The line is 1-based and the column 0-based.  Without a source the
        result is ``(0, pos)``.
        """
        if self.source is None:
            return 0, self.pos

        body = self.source[:self.pos]
        line = body.count('\n')
        # rfind() yields -1 when there is no newline, which makes the column
        # equal to ``pos`` on the first line.
        return line + 1, self.pos - body.rfind('\n') - 1
139 if self.source is None: 

140 return 0, self.pos 

141 

142 body = self.source[:self.pos] 

143 line = body.count('\n') 

144 return line + 1, self.pos - body.rfind('\n', 0) - 1