| 1 |
"""Functions to run cherrypy.response through Tidy or NSGML.""" |
|---|
| 2 |
|
|---|
| 3 |
import cgi |
|---|
| 4 |
import os |
|---|
| 5 |
import StringIO |
|---|
| 6 |
import traceback |
|---|
| 7 |
|
|---|
| 8 |
import cherrypy |
|---|
| 9 |
|
|---|
| 10 |
def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    Nothing is checked unless the response Content-Type is 'text/html'.

    If either 'indent' or 'wrap' are specified, Tidy exits with status 0
    and nothing is reported, then response.body will be set to the output
    of tidy. Otherwise, only errors (including warnings, if warnings is
    True) will change the body.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)

    temp_dir: directory in which 'page.html', 'tidy.out' and 'tidy.err'
        are written.
    tidy_path: path of the tidy executable (quoted on the command line).
    strict_xml: if true, '-xml' is passed to tidy and, when tidy reports
        nothing, the body is additionally parsed as XML.
    errors_to_ignore: optional sequence of substrings; a reported line
        containing any of them is dropped.
    indent: if true, '-indent' is passed to tidy.
    wrap: value passed to tidy as '-wrap N' (converted with int());
        False, or a value int() rejects, adds no option.
    warnings: if true, lines containing 'Warning' are reported as well as
        lines containing 'Error'.

    Returns None; the result is communicated by modifying
    cherrypy.response in place.
    """
    response = cherrypy.response

    # Tidy is fed from a file, so work with the whole body at once.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    if fct.split(';')[0] != 'text/html':
        return

    # Keep only the charset value; parameters following it are cut off.
    encoding = ''
    charset_at = fct.find('charset=')
    if charset_at != -1:
        encoding = fct[charset_at + 8:].split(';')[0].strip()

    page_file = os.path.join(temp_dir, 'page.html')
    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')
    _tidy_write_file(page_file, orig_body)

    # The charset is turned into a switch by dropping its dashes,
    # e.g. 'utf-8' becomes '-utf8'.
    enc_opt = encoding.replace('-', '')
    if enc_opt:
        enc_opt = '-' + enc_opt

    xml_opt = ''
    if strict_xml:
        xml_opt = ' -xml'

    indent_opt = ''
    if indent:
        indent_opt = ' -indent'

    wrap_opt = ''
    if wrap is not False:
        try:
            wrap_opt = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError):
            # Not usable as a column number: run tidy without -wrap.
            wrap_opt = ''

    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, enc_opt, xml_opt, indent_opt, wrap_opt,
                        err_file, out_file, page_file))

    # Tidy's output is only used when reformatting was requested and the
    # command exited with status 0.
    use_output = bool(indent_opt or wrap_opt) and not result
    if use_output:
        output = _tidy_read_file(out_file)

    new_errs = []
    for err in _tidy_read_file(err_file).splitlines():
        reportable = ('Error' in err) or (warnings and 'Warning' in err)
        if not reportable:
            continue
        for err_ign in errors_to_ignore or []:
            if err_ign in err:
                break
        else:
            new_errs.append(err)

    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so the lines are joined with newlines, not with markup.
        _tidy_replace_body(response,
                           wrong_content('\n'.join(new_errs), orig_body))
        return

    if xml_opt:
        # Tidy reported nothing; additionally check that the page parses
        # as XML. Prefer the standard library ElementTree when available.
        try:
            from xml.etree.ElementTree import fromstring
        except ImportError:
            from elementtree.ElementTree import fromstring

        # Swap these entity references for plain text before parsing --
        # presumably so the XML parser does not reject them.
        for tag in ('nbsp', 'quot'):
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())

        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body

        try:
            fromstring(orig_body)
        except Exception:
            # Not well-formed XML: show the traceback above the page.
            _tidy_replace_body(
                response,
                wrong_content(traceback.format_exc(), orig_body, "XML"))
            return

    if use_output:
        _tidy_replace_body(response, [output])


def _tidy_read_file(path):
    """Return the raw contents of ``path``, always closing the handle."""
    handle = open(path, 'rb')
    try:
        return handle.read()
    finally:
        handle.close()


def _tidy_write_file(path, data):
    """Write ``data`` to ``path`` in binary mode, always closing the handle."""
    handle = open(path, 'wb')
    try:
        handle.write(data)
    finally:
        handle.close()


def _tidy_replace_body(response, body):
    """Set ``response.body`` and drop the Content-Length header if present."""
    response.body = body
    try:
        # The header described the previous body.
        del response.headers["Content-Length"]
    except KeyError:
        # No Content-Length was set; nothing to remove.
        pass
|---|
| 117 |
|
|---|
| 118 |
def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.

    '&', '<' and '>' are escaped first; afterwards every space becomes
    the '&nbsp;' entity and every tab becomes four of them.
    """
    # xml.sax.saxutils.escape produces the same '&amp;', '&lt;', '&gt;'
    # replacements as cgi.escape(text), and still exists on Python
    # versions where cgi.escape has been removed.
    from xml.sax.saxutils import escape

    nbsp = '&nbsp;'
    return escape(text).replace(' ', nbsp).replace('\t', nbsp * 4)
|---|
| 121 |
|
|---|
| 122 |
def html_break(text):
    """Escape text, replacing newline with HTML br element.

    '&', '<' and '>' are escaped first, so any markup already present in
    ``text`` is shown literally; only newlines turn into '<br />'.
    """
    # xml.sax.saxutils.escape produces the same '&amp;', '&lt;', '&gt;'
    # replacements as cgi.escape(text), and still exists on Python
    # versions where cgi.escape has been removed.
    from xml.sax.saxutils import escape

    return escape(text).replace('\n', '<br />')
|---|
| 125 |
|
|---|
| 126 |
def wrong_content(header, body, content_type="HTML"):
    """Build the report shown in place of an invalid page.

    The report starts with a 'Wrong <content_type>:' banner followed by
    ``header`` passed through html_break(). Each line of ``body`` follows,
    passed through html_space() and prefixed with its 1-based, zero-padded
    line number. All pieces are joined with '<br />' and returned as one
    string.
    """
    banner = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = ["%03d - %s" % (index + 1, html_space(row))
                for index, row in enumerate(body.splitlines())]
    return "<br />".join([banner] + numbered)
|---|
| 131 |
|
|---|
| 132 |
|
|---|
| 133 |
def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    Nothing is checked unless the response Content-Type is 'text/html'.
    The body, with every '<script' ... '</script>' span removed, is
    written to a file and handed to the nsgmls executable. If nsgmls
    reports any line that is not ignored, response.body is replaced by a
    report of those lines followed by the numbered (script-stripped) page.

    temp_dir: directory in which 'page.html' and 'nsgmls.err' are written.
    nsgmls_path: path of the nsgmls executable.
    catalog_path: catalog passed to nsgmls with '-c'.
    errors_to_ignore: optional sequence of substrings; a reported line
        containing any of them is dropped.

    Returns None; the result is communicated by modifying
    cherrypy.response in place.
    """
    response = cherrypy.response

    # nsgmls is fed from a file, so work with the whole body at once.
    orig_body = response.collapse_body()

    content_type = response.headers.get('Content-Type', '').split(';')[0]
    if content_type != 'text/html':
        return

    # Cut out every <script ...>...</script> span before validating;
    # stop at the first opening tag that has no closing tag.
    while True:
        start = orig_body.find('<script')
        if start == -1:
            break
        end = orig_body.find('</script>', start)
        if end == -1:
            break
        orig_body = orig_body[:start] + orig_body[end + len('</script>'):]

    page_file = os.path.join(temp_dir, 'page.html')
    page_handle = open(page_file, 'wb')
    try:
        page_handle.write(orig_body)
    finally:
        page_handle.close()

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    # The whole command line uses forward slashes -- presumably so that
    # Windows-style paths survive; confirm before changing.
    command = command.replace('\\', '/')
    os.system(command)

    err_handle = open(err_file, 'rb')
    try:
        errs = err_handle.read()
    finally:
        err_handle.close()

    new_errs = []
    for err in errs.splitlines():
        for err_ign in errors_to_ignore or []:
            if err_ign in err:
                break
        else:
            new_errs.append(err)

    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so the lines are joined with newlines, not with markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        try:
            # The header described the previous body.
            del response.headers["Content-Length"]
        except KeyError:
            # No Content-Length was set; nothing to remove.
            pass
|---|
| 184 |
|
|---|