[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [ooni-probe/master] Add support for detecting the charset of the HTML response body via the meta tag
commit 37564aa2876ba18516bb28e0e8eec7b489eea1c3
Author: Arturo Filastò <arturo@xxxxxxxxxxx>
Date: Thu Apr 14 17:16:46 2016 +0200
Add support for detecting the charset of the HTML response body via the meta tag
---
ooni/templates/httpt.py | 39 ++++++++++++++++++++++++++-------------
ooni/tests/test_templates.py | 10 ++++++++++
2 files changed, 36 insertions(+), 13 deletions(-)
diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py
index 6ca486a..edab3fa 100644
--- a/ooni/templates/httpt.py
+++ b/ooni/templates/httpt.py
@@ -1,3 +1,4 @@
+import re
import random
from twisted.internet import defer
@@ -16,6 +17,7 @@ from ooni.utils.net import BodyReceiver, StringProducer, userAgents
from ooni.utils.trueheaders import TrueHeaders
from ooni.errors import handleAllFailures
+# Matches an HTML <meta ...> tag that carries a "charset=" declaration and
+# captures the declared charset name in group 1.
+#  * (?!\s*(?:name|value)\s*=) rejects tags where "<meta" is immediately
+#    followed by a name= or value= attribute.
+#  * [^>]*? lazily skips the other attributes without leaving the tag.
+#  * [\s"\']* tolerates whitespace and quotes between "=" and the value.
+#  * ([^\s"\'/>]*) is the charset name: everything up to whitespace, a
+#    quote, "/" or ">". It may be empty.
+# NOTE: compiled without flags, so matching is case-sensitive.
+META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)')
class InvalidSocksProxyOption(Exception):
pass
@@ -35,6 +37,30 @@ class StreamListener(StreamListenerMixin):
except:
log.err("Tor Exit ip detection failed")
+
+
+def _representBody(body):
+    """
+    Return a report-friendly representation of an HTTP response body.
+
+    NUL bytes are stripped, then the body is decoded to unicode trying, in
+    order: the charset declared in an HTML <meta> tag (when one is found),
+    'ascii' and 'utf-8'. The first encoding that decodes cleanly wins.
+
+    If no candidate encoding works the body is handed to base64Dict() and
+    whatever that helper returns is returned instead.
+
+    Args:
+        body: the raw response body as a byte string.
+
+    Returns:
+        The decoded unicode body, or the result of base64Dict(body).
+    """
+    # XXX perhaps add support for decoding gzip in the future.
+    body = body.replace('\0', '')
+    charsets = ['ascii', 'utf-8']
+
+    # If we are able to detect the charset of body from the meta tag
+    # try to decode using that one first.
+    # The second positional argument of a compiled pattern's search() is
+    # the start offset, not the flags, so IGNORECASE has to be applied
+    # through re.search() on the pattern source instead.
+    match = re.search(META_CHARSET_REGEXP.pattern, body, re.IGNORECASE)
+    # The capture group may be empty (e.g. 'charset=">'); skip it then.
+    if match and match.group(1):
+        charsets.insert(0, match.group(1))
+
+    for encoding in charsets:
+        try:
+            return unicode(body, encoding)
+        except (UnicodeDecodeError, LookupError):
+            # UnicodeDecodeError: the body is not valid in this encoding.
+            # LookupError: the page declared a charset name that Python
+            # does not know; it is untrusted input, so just move on.
+            continue
+    return base64Dict(body)
+
class HTTPTest(NetTestCase):
"""
A utility class for dealing with HTTP based testing. It provides methods to
@@ -128,19 +154,6 @@ class HTTPTest(NetTestCase):
represented_headers[name] = value[0]
return represented_headers
- def _representBody(body):
- # XXX perhaps add support for decoding gzip in the future.
- try:
- body = unicode(body, 'ascii')
- body = body.replace('\0', '')
- except UnicodeDecodeError:
- try:
- body = unicode(body, 'utf-8')
- body = body.replace('\0', '')
- except UnicodeDecodeError:
- body = base64Dict(body)
- return body
-
log.debug("Adding %s to report" % request)
request_headers = TrueHeaders(request['headers'])
session = {
diff --git a/ooni/tests/test_templates.py b/ooni/tests/test_templates.py
index bf05e56..931e052 100644
--- a/ooni/tests/test_templates.py
+++ b/ooni/tests/test_templates.py
@@ -44,6 +44,16 @@ class TestHTTPT(unittest.TestCase):
yield self.assertFailure(http_test.doRequest('http://invaliddomain/'), DNSLookupError)
assert http_test.report['requests'][0]['failure'] == 'dns_lookup_error'
+ def test_charset_detection(self):
+ # Checks httpt.META_CHARSET_REGEXP against two HTML fixtures: one with no
+ # <meta> charset declaration and one with it appended.
+ no_charset_html = """
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html>
+<head>
+ <title>Foo</title>
+"""
+ # Same document plus a Content-Type <meta> tag declaring iso-8859-1.
+ with_charset_html = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
+ # Without a <meta> tag the pattern must not match at all.
+ self.assertEqual(httpt.META_CHARSET_REGEXP.search(no_charset_html), None)
+ # With the tag, group 1 must hold exactly the declared charset name.
+ self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_charset_html).group(1), 'iso-8859-1')
class TestDNST(unittest.TestCase):
def test_represent_answer_a(self):
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits