1
2 """
3 Retrieve all CSS stylesheets including embedded for a given URL.
4 Retrieve as StyleSheetList or save to disk.
5
6 TODO:
7 @import
8 save all
9
10 maybe use DOM 3 load/save?
11
12 logger class which handles all cases when no log is given...
13
14 saveto:
15 why does urllib2 hang?
16 """
17 __all__ = ['CSSCapture']
18 __docformat__ = 'restructuredtext'
19 __author__ = '$LastChangedBy: doerwalter $'
20 __date__ = '$LastChangedDate: 2007-08-21 17:18:10 +0200 (Di, 21 Aug 2007) $'
21 __version__ = '0.9.2a1, $LastChangedRevision: 263 $'
22
23 import errno
24 import HTMLParser
25 import logging
26 import os
27 import sys
28 import urllib
29 import urllib2
30 import urlparse
31 import xml.dom
32
33 import cssutils
34 from cssutils import css, stylesheets
35
36 try:
37 import encutils
38 except ImportError:
39 try:
40 import cssutils.encutils as encutils
41 except ImportError:
42 sys.exit("You need encutils from http://cthedot.de/encutils/")
43
44
46 """ parses given data for link and style elements """
47 curtag = u''
48 links = []
49
50 styles = []
51
52
def _lowerattrs(self, attrs):
    """
    Return the (name, value) pairs in ``attrs`` as a dict with
    lower-cased attribute names.

    Values are lower-cased as well, so that e.g. ``type`` can be
    compared case-insensitively, with two exceptions:

    - ``href`` and ``title`` keep their case (URL paths are
      case-sensitive, titles are free text)
    - a value of ``None``, which HTMLParser reports for an attribute
      written without a value, is passed through unchanged
    """
    lowered = {}
    for name, value in attrs:
        name = name.lower()
        if value is not None and name not in (u'href', u'title'):
            value = value.lower()
        lowered[name] = value
    return lowered
55
def handle_starttag(self, tag, attrs):
    """
    Record ``link`` and ``style`` start tags whose ``type`` attribute is
    ``text/css``.

    The lower-cased attributes of a matching ``link`` are appended to
    ``self.links``. For a matching ``style`` an ``(attrs, u'')`` pair is
    appended to ``self.styles`` and ``self.curtag`` is set to the tag.
    Any other start tag resets ``self.curtag`` to ``u''``.
    """
    if tag not in (u'link', u'style'):
        # some other element started: stop tracking a style element
        self.curtag = u''
        return

    lowered = self._lowerattrs(attrs)
    if lowered.get(u'type', u'') != u'text/css':
        return

    if tag == u'link':
        self.links.append(lowered)
    else:
        self.styles.append((lowered, u''))
        self.curtag = tag
70
74
78
82
83
85 """
86 Retrieve all CSS stylesheets including embedded for a given URL.
87 Optional setting of User-Agent used for retrieval possible
88 to handle browser sniffing servers.
89
90 raises urllib2.HTTPError
91 """
92
def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
    """
    Initialize a new capture object.

    ua
        User-Agent to use for requests
    log
        log object which is used instead of the default log, which
        writes to sys.stderr
    defaultloglevel
        constant of the logging package which defines the level of the
        default log; only used if no explicit log is given
    """
    self._ua = ua
    self._parser = CSSCaptureHTMLParser()

    if log:
        self._log = log
    else:
        self._log = logging.getLogger('CSSCapture')
        # getLogger returns the same named logger on every call, so the
        # stderr handler is attached only once; otherwise every further
        # instance would duplicate each log line
        if not self._log.handlers:
            hdlr = logging.StreamHandler(sys.stderr)
            hdlr.setFormatter(
                logging.Formatter('%(levelname)s\t%(message)s'))
            self._log.addHandler(hdlr)
        self._log.setLevel(defaultloglevel)
        self._log.debug(u'(C) Using default log')
119
120
def _doRequest(self, url):
    """
    Do an HTTP request for ``url``.

    Returns: (response, url)

    The returned url is the one reported by the response; it may differ
    from the requested one due to redirects etc.

    If the request fails with urllib2.HTTPError the error is logged as
    critical and (None, None) is returned.
    """
    self._log.debug(u'(C) _doRequest URL: %s', url)

    req = urllib2.Request(url)
    if self._ua:
        req.add_header('User-agent', self._ua)
        self._log.info('(C) Using User-Agent: %s', self._ua)
    try:
        res = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        self._log.critical('(C) %s\n%s %s\n%s',
                           e.geturl(), e.code, e.msg, e.headers)
        return None, None

    # report the URL the response actually came from
    finalurl = res.geturl()
    if url != finalurl:
        url = finalurl
        self._log.info('(C) URL retrieved: %s', url)

    return res, url
148
149
def _doImports(self, parentStyleSheet, baseurl=None, _chain=None):
    """
    Handle all @import rules of ``parentStyleSheet`` recursively.

    Every imported stylesheet that can be retrieved is appended to
    ``self.stylesheetlist`` and is searched for @import rules itself.

    parentStyleSheet
        stylesheet whose rules are searched for @import rules
    baseurl
        URL the href of each @import rule is resolved against
    _chain
        internal use only: URLs of the stylesheets on the current
        import path, used to stop circular imports
    """
    if _chain is None:
        _chain = (getattr(parentStyleSheet, 'href', None),)

    for rule in parentStyleSheet.cssRules:
        if rule.type != css.CSSRule.IMPORT_RULE:
            continue

        media = rule.media
        res, href = self._doRequest(urlparse.urljoin(baseurl, rule.href))
        if not res:
            continue
        try:
            cssText = res.read()
        finally:
            # do not leave the connection open while recursing
            res.close()

        if href in _chain:
            # a stylesheet importing itself (directly or via other
            # stylesheets) would otherwise recurse without end
            self._log.warn('(C) - SKIPPING circular @import: %s', href)
            continue

        sheet = css.CSSStyleSheet(
            href=href,
            media=media,
            parentStyleSheet=parentStyleSheet
            )
        self.stylesheetlist.append(sheet)

        self._log.info('(C) - FOUND @import in: %s', parentStyleSheet)
        self._log.info('(C) * stylesheet : %s', sheet)
        self._log.info('(C) * full href : %s', href)
        self._log.info('(C) * media : %s', media.mediaText)
        self._log.debug('(C) * cssText :\n%s', cssText)

        try:
            sheet.cssText = cssText
        except xml.dom.DOMException as e:
            self._log.warn('(C) * CSSParser message:\n%s', e)

        # @import rules of the imported sheet are relative to its own URL
        self._doImports(sheet, baseurl=href, _chain=_chain + (href,))
183
184
def _findStyleSheets(self, docurl, doctext):
    """
    Parse ``doctext`` for stylesheets and fill ``self.stylesheetlist``
    with all StyleSheets found: first the ones referenced by ``link``
    elements, then the ones embedded in ``style`` elements. @import
    rules of every stylesheet are followed via ``_doImports``.

    docurl
        URL of the document, used to build the full URL of a
        stylesheet's href
    doctext
        text of the document to parse
    """
    self._parser.feed(doctext)

    # stylesheets referenced by <link>
    for link in self._parser.links:
        href = urlparse.urljoin(docurl, link.get(u'href', u''))
        media = stylesheets.MediaList(link.get(u'media', u''))
        res, href = self._doRequest(href)
        if not res:
            continue
        try:
            cssText = res.read()
        finally:
            res.close()
        sheet = css.CSSStyleSheet(
            href=href,
            media=media,
            title=link.get(u'title', u''),
            )
        self.stylesheetlist.append(sheet)

        self._log.info('(C) - FOUND <link>: %s', link)
        self._log.info('(C) * stylesheet: %s', sheet)
        self._log.info('(C) * full href : %s', href)
        self._log.info('(C) * media : %s', media.mediaText)
        self._log.debug('(C) * cssText :\n%s', cssText)

        try:
            sheet.cssText = cssText
        except xml.dom.DOMException as e:
            self._log.warn('(C) * CSSParser message:\n%s', e)
        # relative @import URLs in an external stylesheet are relative
        # to that stylesheet, not to the document linking to it
        self._doImports(sheet, baseurl=href)

    # stylesheets embedded in <style>
    for stylemeta, cssText in self._parser.styles:
        media = stylesheets.MediaList(stylemeta.get(u'media', u''))
        sheet = css.CSSStyleSheet(
            href=None,
            media=media,
            title=stylemeta.get(u'title', u''),
            )
        self.stylesheetlist.append(sheet)

        self._log.info('(C) - FOUND <style>: %s', stylemeta)
        self._log.info('(C) * stylesheet : %s', sheet)
        self._log.info('(C) * media : %s', media.mediaText)
        self._log.debug('(C) * cssText :\n%s', cssText)

        try:
            sheet.cssText = cssText
        except xml.dom.DOMException as e:
            self._log.warn('(C) * CSSParser message:\n%s', e)
        # an embedded stylesheet has no URL of its own
        self._doImports(sheet, baseurl=docurl)
250
251
def capture(self, url, ua=None):
    """
    Capture all stylesheets of the document at ``url``.

    url
        to capture CSS from
    ua
        User-Agent to use for requests; if not None it replaces the
        User-Agent set before

    Returns StyleSheetList.

    If no response for ``url`` is obtained from ``_doRequest`` the
    process is terminated with ``sys.exit(1)``.
    """
    if ua is not None:
        self._ua = ua

    # basename of the URL path, used by saveto() to name embedded sheets
    self._filename = os.path.basename(urlparse.urlsplit(url)[2])

    self.stylesheetlist = stylesheets.StyleSheetList()

    self._log.debug('(C) CSSCapture.capture(%s)', url)
    self._log.info('(C) URL supplied: %s', url)

    # get the document; url may be changed by redirects
    res, url = self._doRequest(url)
    if not res:
        sys.exit(1)
    try:
        rawdoc = res.read()
        encoding = encutils.getEncodingInfo(
            res, rawdoc, log=self._log).encoding
    finally:
        res.close()
    self._log.info('(C) Using Encoding: %s', encoding)

    doctext = unicode(rawdoc, encoding)

    # fill the list of stylesheets, including @imported ones
    self._findStyleSheets(url, doctext)

    return self.stylesheetlist
292
293
def saveto(self, dir, saveparsed=False):
    """
    Save all captured stylesheets below ``dir`` in the same layout as on
    the server, i.e. as "dir/<network location>/<path>".

    Embedded stylesheets have no URL; they are saved as
    "dir/<filename>_INLINE_<n>.css", where <filename> is the basename
    stored by capture() and <n> counts the embedded sheets from 0.

    dir
        directory to save files to
    saveparsed
        meant to select the parsed version instead of the literal CSS
        from the server; currently not evaluated, the serialized
        ``sheet.cssText`` is always written

    Raises OSError if a directory cannot be created for any reason other
    than that it already exists.
    """
    inlines = 0
    for sheet in self.stylesheetlist:
        url = sheet.href
        if not url:
            url = '%s_INLINE_%s.css' % (self._filename, inlines)
            inlines += 1

        # serializer preference is set before the sheet is serialized
        cssutils.ser.prefs.keepAllProperties = True
        cssText = sheet.cssText

        loc, path = urlparse.urlsplit(url)[1:3]

        # The URL comes from the server. Normalizing the path as an
        # absolute one resolves ".." segments against the root, so they
        # cannot climb out of the save directory. Stripping all leading
        # slashes keeps os.path.join from treating the rest as an
        # absolute path.
        path = os.path.normpath('/' + path.lstrip('/')).lstrip(os.sep)
        path, fn = os.path.split(path)

        savepath = os.path.join(dir, loc, path)
        savefn = os.path.join(savepath, fn)

        try:
            os.makedirs(savepath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                # bare raise keeps the original traceback
                raise
            self._log.debug('(C) Path "%s" already exists.', savepath)

        # NOTE(review): cssText is written as is; presumably it may
        # contain non-ASCII text that needs explicit encoding - confirm
        with open(savefn, 'w') as f:
            f.write(cssText)
        self._log.info('(C) Saving "%s"', savefn)
342
343
def main(args=None):
    """
    Command line entry point: capture the stylesheets of the URL given
    on the command line and either save them to disk or, with
    ``--notsave``, print title and href of each one.

    args
        list of command line arguments (without the program name);
        None makes optparse use sys.argv[1:]
    """
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", default to "_CSSCapture_SAVED"')
    parser.add_option('-p', '--saveparsed', action='store_true', dest='saveparsed',
        help='if given saves cssutils\' parsed files, otherwise original retrieved files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    # pass args on, otherwise the parameter of main() has no effect
    options, urls = parser.parse_args(args)

    if not urls:
        # prints the usage message and exits with status 2
        parser.error('no URL given')
    url = urls[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    # START
    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    if not options.notsave:
        saveto = options.saveto or '_CSSCapture_SAVED'
        c.saveto(saveto, saveparsed=options.saveparsed)
    else:
        for i, s in enumerate(stylesheetlist):
            print('%s %s' % (
                i + 1,
                '\tTitle: "%s", \n\thref: "%s"\n' % (s.title, s.href)))


if __name__ == "__main__":
    sys.exit(main())
389