[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [stem/master] Using binary mode when reading descriptors
commit 3afa4346d102d99eab93929be738cdb594105d9b
Author: Damian Johnson <atagar@xxxxxxxxxxxxxx>
Date: Thu Jan 31 09:44:18 2013 -0800
Using binary mode when reading descriptors
Now Damian, repleat after me: text mode is bad.
In python 2.x text mode and binary mode seem to be indistinguishable, but in
python 3 there's one tiny little difference: text mode is around 33x slower.
The integ test that read the cached-consensus took over five minutes (by
comparison to ten seconds with python 2.7), and in one case simply hung for
twenty minutes before I killed it.
I'm not aware of any disadvantage to using binary mode, so opting for that.
---
stem/descriptor/__init__.py | 11 +++++-
stem/descriptor/reader.py | 7 +---
stem/descriptor/server_descriptor.py | 2 +-
stem/util/str_tools.py | 1 +
test/integ/descriptor/__init__.py | 15 -------
test/integ/descriptor/extrainfo_descriptor.py | 8 ++--
test/integ/descriptor/networkstatus.py | 18 ++++----
test/integ/descriptor/server_descriptor.py | 55 ++++++++-----------------
test/unit/descriptor/server_descriptor.py | 3 +-
9 files changed, 45 insertions(+), 75 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index bebcdcc..788b442 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -83,7 +83,7 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
tordnsel 1.0 **unsupported**
===================================== =====
- If you're using python 3 then beware of the open() function's universal
+ If you're using **python 3** then beware of the open() function's universal
newline translation. By default open() converts all common line endings (NL,
CR, and CRNL) into NL. In some edge cases this can cause us to misparse
content. To disable newline translation set the **newline** to an empty
@@ -93,6 +93,15 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
my_descriptor_file = open(descrptor_path, newline='')
+ What's more, python 3's read performance in **text mode** is deplorably bad
+ (my testing with python 3.2 shows it to be 33x slower). Using **binary mode**
+ is strongly suggested. If you do this then newline translation is
+ automatically disabled...
+
+ ::
+
+ my_descriptor_file = open(descriptor_path, 'rb')
+
:param file descriptor_file: opened file with the descriptor contents
:param str descriptor_type: `descriptor type <https://metrics.torproject.org/formats.html#descriptortypes>`_, this is guessed if not provided
:param str path: absolute path to the file's location on disk
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index ef35fc0..f00e0aa 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -513,12 +513,7 @@ class DescriptorReader(object):
try:
self._notify_read_listeners(target)
- if stem.prereq.is_python_3():
- target_file = open(target, newline = '')
- else:
- target_file = open(target)
-
- with target_file as target_file:
+ with open(target, 'rb') as target_file:
for desc in stem.descriptor.parse_file(target_file, validate = self._validate, path = target):
if self._is_stopped.isSet():
return
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index 2376a56..95a4ef0 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -124,7 +124,7 @@ def _parse_file(descriptor_file, is_bridge = False, validate = True):
if descriptor_content:
# strip newlines from annotations
- annotations = map(str.strip, annotations)
+ annotations = map(unicode.strip, annotations)
descriptor_text = "".join(descriptor_content)
diff --git a/stem/util/str_tools.py b/stem/util/str_tools.py
index 37756ed..2733875 100644
--- a/stem/util/str_tools.py
+++ b/stem/util/str_tools.py
@@ -72,6 +72,7 @@ else:
else:
return msg
+
def to_bytes(msg):
"""
Provides the ASCII bytes for the given string. This is purely to provide
diff --git a/test/integ/descriptor/__init__.py b/test/integ/descriptor/__init__.py
index 31421f1..58f9f81 100644
--- a/test/integ/descriptor/__init__.py
+++ b/test/integ/descriptor/__init__.py
@@ -12,8 +12,6 @@ __all__ = [
import os
-import stem.prereq
-
DESCRIPTOR_TEST_DATA = os.path.join(os.path.dirname(__file__), "data")
@@ -23,16 +21,3 @@ def get_resource(filename):
"""
return os.path.join(DESCRIPTOR_TEST_DATA, filename)
-
-
-def open_desc(filename, absolute = False):
- """
- Provides the file for a given descriptor in our data directory.
- """
-
- path = filename if absolute else get_resource(filename)
-
- if stem.prereq.is_python_3():
- return open(path, newline = '')
- else:
- return open(path)
diff --git a/test/integ/descriptor/extrainfo_descriptor.py b/test/integ/descriptor/extrainfo_descriptor.py
index 0bcc23f..4cb9803 100644
--- a/test/integ/descriptor/extrainfo_descriptor.py
+++ b/test/integ/descriptor/extrainfo_descriptor.py
@@ -12,7 +12,7 @@ import stem.descriptor.extrainfo_descriptor
import test.runner
from stem.descriptor.extrainfo_descriptor import DirResponse
-from test.integ.descriptor import open_desc
+from test.integ.descriptor import get_resource
class TestExtraInfoDescriptor(unittest.TestCase):
@@ -21,7 +21,7 @@ class TestExtraInfoDescriptor(unittest.TestCase):
Parses and checks our results against an extrainfo descriptor from metrics.
"""
- descriptor_file = open_desc("extrainfo_relay_descriptor")
+ descriptor_file = open(get_resource("extrainfo_relay_descriptor"), 'rb')
descriptor_file.readline() # strip header
descriptor_contents = descriptor_file.read()
descriptor_file.close()
@@ -70,7 +70,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
metrics.
"""
- descriptor_file = open_desc("extrainfo_bridge_descriptor")
+ descriptor_file = open(get_resource("extrainfo_bridge_descriptor"), 'rb')
descriptor_file.readline() # strip header
descriptor_contents = descriptor_file.read()
descriptor_file.close()
@@ -146,7 +146,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
test.runner.skip(self, "(no cached descriptors)")
return
- with open_desc(descriptor_path, absolute = True) as descriptor_file:
+ with open(descriptor_path, 'rb') as descriptor_file:
for desc in stem.descriptor.extrainfo_descriptor._parse_file(descriptor_file):
unrecognized_lines = desc.get_unrecognized_lines()
diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py
index 19c3d8f..df676b4 100644
--- a/test/integ/descriptor/networkstatus.py
+++ b/test/integ/descriptor/networkstatus.py
@@ -15,7 +15,7 @@ import stem.descriptor.networkstatus
import stem.version
import test.runner
-from test.integ.descriptor import get_resource, open_desc
+from test.integ.descriptor import get_resource
class TestNetworkStatus(unittest.TestCase):
@@ -42,7 +42,7 @@ class TestNetworkStatus(unittest.TestCase):
return
count = 0
- with open_desc(consensus_path, absolute = True) as descriptor_file:
+ with open(consensus_path, 'rb') as descriptor_file:
document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
for router in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type):
@@ -89,7 +89,7 @@ class TestNetworkStatus(unittest.TestCase):
return
count = 0
- with open_desc(consensus_path, absolute = True) as descriptor_file:
+ with open(consensus_path, 'rb') as descriptor_file:
document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
for router in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, is_microdescriptor = True):
@@ -120,7 +120,7 @@ class TestNetworkStatus(unittest.TestCase):
consensus_path = get_resource("metrics_consensus")
for specify_type in (True, False):
- with open_desc(consensus_path, absolute = True) as descriptor_file:
+ with open(consensus_path, 'rb') as descriptor_file:
if specify_type:
descriptors = stem.descriptor.parse_file(descriptor_file, "network-status-consensus-3 1.0", path = consensus_path)
else:
@@ -142,7 +142,7 @@ class TestNetworkStatus(unittest.TestCase):
consensus_path = get_resource("bridge_network_status")
- with open_desc(consensus_path, absolute = True) as descriptor_file:
+ with open(consensus_path, 'rb') as descriptor_file:
descriptors = stem.descriptor.parse_file(descriptor_file, path = consensus_path)
router = next(descriptors)
@@ -245,7 +245,7 @@ mfWcW0b+jsrXcJoCxV5IrwCDF3u1aC3diwZY6yiG186pwWbOwE41188XI2DeYPwE
I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY=
-----END SIGNATURE-----"""
- with open_desc("cached-consensus") as descriptor_file:
+ with open(get_resource("cached-consensus"), 'rb') as descriptor_file:
document = stem.descriptor.networkstatus.NetworkStatusDocumentV3(descriptor_file.read(), default_params = False)
self.assertEquals(3, document.version)
@@ -316,7 +316,7 @@ nTA+fD8JQqpPtb8b0SnG9kwy75eS//sRu7TErie2PzGMxrf9LH0LAgMBAAE=
TpQQk3nNQF8z6UIvdlvP+DnJV4izWVkQEZgUZgIVM0E=
-----END SIGNATURE-----"""
- with open_desc("cached-consensus-v2") as descriptor_file:
+ with open(get_resource("cached-consensus-v2"), 'rb') as descriptor_file:
descriptor_file.readline() # strip header
document = stem.descriptor.networkstatus.NetworkStatusDocumentV2(descriptor_file.read())
@@ -381,7 +381,7 @@ TpQQk3nNQF8z6UIvdlvP+DnJV4izWVkQEZgUZgIVM0E=
vote_path = get_resource("metrics_vote")
- with open_desc(vote_path, absolute = True) as descriptor_file:
+ with open(vote_path, 'rb') as descriptor_file:
descriptors = stem.descriptor.parse_file(descriptor_file, path = vote_path)
router = next(descriptors)
@@ -443,7 +443,7 @@ JZ/1HL9sHyZfo6bwaC6YSM9PNiiY6L7rnGpS7UkHiFI+M96VCMorvjm5YPs3FioJ
DnN5aFtYKiTc19qIC7Nmo+afPdDEf0MlJvEOP5EWl3w=
-----END SIGNATURE-----"""
- with open_desc("vote") as descriptor_file:
+ with open(get_resource("vote"), 'rb') as descriptor_file:
document = stem.descriptor.networkstatus.NetworkStatusDocumentV3(descriptor_file.read(), default_params = False)
self.assertEquals(3, document.version)
diff --git a/test/integ/descriptor/server_descriptor.py b/test/integ/descriptor/server_descriptor.py
index 549cea8..c1cd742 100644
--- a/test/integ/descriptor/server_descriptor.py
+++ b/test/integ/descriptor/server_descriptor.py
@@ -12,11 +12,10 @@ import stem.control
import stem.descriptor
import stem.descriptor.server_descriptor
import stem.exit_policy
-import stem.util.str_tools
import stem.version
import test.runner
-from test.integ.descriptor import open_desc
+from test.integ.descriptor import get_resource
class TestServerDescriptor(unittest.TestCase):
@@ -25,10 +24,7 @@ class TestServerDescriptor(unittest.TestCase):
Parses and checks our results against a server descriptor from metrics.
"""
- descriptor_file = open_desc("example_descriptor")
- descriptor_file.readline() # strip header
- descriptor_contents = descriptor_file.read()
- descriptor_file.close()
+ descriptor_file = open(get_resource("example_descriptor"), 'rb')
expected_family = [
"$0CE3CFB1E9CC47B63EA8869813BF6FAB7D4540C1",
@@ -59,7 +55,7 @@ dskLSPz8beUW7bzwDjR6EVNGpyoZde83Ejvau+5F2c6cGnlu91fiZN3suE88iE6e
Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
-----END SIGNATURE-----"""
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+ desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
self.assertEquals("caerSidi", desc.nickname)
self.assertEquals("A7569A83B5706AB1B1A9CB52EFF7D2D32E4553EB", desc.fingerprint)
self.assertEquals("71.35.133.197", desc.address)
@@ -95,7 +91,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
Parses and checks our results against a server descriptor from metrics.
"""
- with open_desc("metrics_server_desc_multiple") as descriptor_file:
+ with open(get_resource("metrics_server_desc_multiple"), 'rb') as descriptor_file:
descriptors = list(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
self.assertEquals(2, len(descriptors))
@@ -111,12 +107,9 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
Parses a relay server descriptor from 2005.
"""
- descriptor_file = open_desc("old_descriptor")
- descriptor_file.readline() # strip header
- descriptor_contents = descriptor_file.read()
- descriptor_file.close()
+ descriptor_file = open(get_resource("old_descriptor"), 'rb')
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+ desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
self.assertEquals("krypton", desc.nickname)
self.assertEquals("3E2F63E2356F52318B536A12B6445373808A5D6C", desc.fingerprint)
self.assertEquals("212.37.39.59", desc.address)
@@ -173,8 +166,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
test.runner.skip(self, "(no cached descriptors)")
return
- with open_desc(descriptor_path, absolute = True) as descriptor_file:
- for desc in stem.descriptor.server_descriptor._parse_file(descriptor_file):
+ with open(descriptor_path, 'rb') as descriptor_file:
+ for desc in stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"):
# the following attributes should be deprecated, and not appear in the wild
self.assertEquals(None, desc.read_history_end)
self.assertEquals(None, desc.write_history_end)
@@ -197,14 +190,11 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
Parses a descriptor with non-ascii content.
"""
- descriptor_file = open_desc("non-ascii_descriptor")
- descriptor_file.readline() # strip header
- descriptor_contents = stem.util.str_tools.to_unicode(descriptor_file.read())
- descriptor_file.close()
+ descriptor_file = open(get_resource("non-ascii_descriptor"), 'rb')
expected_contact = b"2048R/F171EC1F Johan Bl\xc3\xa5b\xc3\xa4ck \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf".decode("utf-8", "replace")
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+ desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
self.assertEquals("torrelay389752132", desc.nickname)
self.assertEquals("5D47E91A1F7421A4E3255F4D04E534E9A21407BB", desc.fingerprint)
self.assertEquals("130.243.230.116", desc.address)
@@ -237,12 +227,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
returns ('\r' entries).
"""
- descriptor_file = open_desc("cr_in_contact_line")
- descriptor_file.readline() # strip header
- descriptor_contents = descriptor_file.read()
- descriptor_file.close()
-
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+ descriptor_file = open(get_resource("cr_in_contact_line"), 'rb')
+ desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
self.assertEquals("pogonip", desc.nickname)
self.assertEquals("6DABD62BC65D4E6FE620293157FC76968DAB9C9B", desc.fingerprint)
@@ -263,12 +249,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
where we shouldn't be.
"""
- descriptor_file = open_desc("negative_uptime")
- descriptor_file.readline() # strip header
- descriptor_contents = descriptor_file.read()
- descriptor_file.close()
-
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+ descriptor_file = open(get_resource("negative_uptime"), 'rb')
+ desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
self.assertEquals("TipTor", desc.nickname)
self.assertEquals("137962D4931DBF08A24E843288B8A155D6D2AEDD", desc.fingerprint)
@@ -277,7 +259,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
# modify the relay version so it's after when the negative uptime bug
# should appear
- descriptor_contents = descriptor_contents.replace("Tor 0.1.1.25", "Tor 0.1.2.7")
+ descriptor_contents = str(desc).replace("Tor 0.1.1.25", "Tor 0.1.2.7")
self.assertRaises(ValueError, stem.descriptor.server_descriptor.RelayDescriptor, descriptor_contents)
def test_bridge_descriptor(self):
@@ -285,10 +267,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
Parses a bridge descriptor.
"""
- descriptor_file = open_desc("bridge_descriptor")
- descriptor_file.readline() # strip header
- descriptor_contents = descriptor_file.read()
- descriptor_file.close()
+ descriptor_file = open(get_resource("bridge_descriptor"), 'rb')
expected_family = [
"$CE396C72A3D0880F74C064FEA79D68C15BD380B9",
@@ -296,7 +275,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
"$8C8A470D7C23151665A7B84E75E89FCC205A3304",
]
- desc = stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_contents)
+ desc = next(stem.descriptor.parse_file(descriptor_file, "bridge-server-descriptor 1.0"))
self.assertEquals("Unnamed", desc.nickname)
self.assertEquals("AE54E28ED069CDF45F3009F963EE3B3D6FA26A2E", desc.fingerprint)
self.assertEquals("10.45.227.253", desc.address)
diff --git a/test/unit/descriptor/server_descriptor.py b/test/unit/descriptor/server_descriptor.py
index 00fd96f..66ec460 100644
--- a/test/unit/descriptor/server_descriptor.py
+++ b/test/unit/descriptor/server_descriptor.py
@@ -9,6 +9,7 @@ import unittest
import stem.descriptor.server_descriptor
import stem.exit_policy
import stem.prereq
+import stem.util.str_tools
from stem.descriptor.server_descriptor import RelayDescriptor, BridgeDescriptor
@@ -209,7 +210,7 @@ class TestServerDescriptor(unittest.TestCase):
desc_text += "\ntrailing text that should be ignored, ho hum"
# running _parse_file should provide an iterator with a single descriptor
- desc_iter = stem.descriptor.server_descriptor._parse_file(StringIO.StringIO(desc_text))
+ desc_iter = stem.descriptor.server_descriptor._parse_file(StringIO.StringIO(stem.util.str_tools.to_unicode(desc_text)))
desc_entries = list(desc_iter)
self.assertEquals(1, len(desc_entries))
desc = desc_entries[0]
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits