[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [stem/master] Initial descriptor reading function
commit 07bf976cbf57824e0425ee33bfd9763934915bbb
Author: Damian Johnson <atagar@xxxxxxxxxxxxxx>
Date: Wed Jul 24 18:12:29 2019 -0700
Initial descriptor reading function
Still quite a few rough edges, but works. Exercised with...
import stem.descriptor.collector
collector = stem.descriptor.collector.CollecTor()
f = list(filter(lambda x: 'server-descriptor 1.0' in x._guess_descriptor_types(), collector.files()))[0]
for desc in f.read('/home/atagar/Desktop/foo'):
print(desc)
We're splitting the download and read methods so descriptor archives can
optionally cache locally.
---
stem/descriptor/collector.py | 123 ++++++++++++++++++++++++++++++++++++--
test/unit/descriptor/collector.py | 8 +--
2 files changed, 122 insertions(+), 9 deletions(-)
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index 754a96b9..1f7ddb96 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -51,11 +51,15 @@ With this you can either download and read directly from CollecTor...
import datetime
import json
+import os
import re
import sys
+import tempfile
import time
-from stem.descriptor import Compression
+import stem.util.str_tools
+
+from stem.descriptor import Compression, parse_file
from stem.util import log
try:
@@ -121,7 +125,6 @@ def _download(url, timeout, retries):
:returns: content of the given url
:raises:
- * **IOError** if unable to decompress
* **socket.timeout** if our request timed out
* **urllib2.URLError** for most request failures
@@ -174,8 +177,118 @@ class File(object):
self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M')
self._guessed_type = None
+ self._downloaded_to = None # location we last downloaded to
+
+ def read(self, directory = None, descriptor_type = None, timeout = None, retries = 3):
+ """
+ Provides descriptors from this archive. Descriptors are downloaded or read
+ from disk as follows...
+
+  * If this file has already been downloaded through
+    :func:`~stem.descriptor.collector.CollecTor.download` these descriptors
+    are read from disk.
+
+ * If a **directory** argument is provided and the file is already present
+ these descriptors are read from disk.
+
+  * If a **directory** argument is provided and the file is not present the
+    file is downloaded to this location and then read.
+
+  * If the file has not been downloaded and no **directory** argument
+    is provided then the file is downloaded to a temporary directory that's
+    deleted after it is read.
+
+ :param str directory: destination to download into
+ :param str descriptor_type: `descriptor type
+ <https://metrics.torproject.org/collector.html#data-formats>`_, this is
+ guessed if not provided
+ :param int timeout: timeout when connection becomes idle, no timeout
+ applied if **None**
+  :param int retries: maximum attempts to impose
+
+ :returns: iterator for :class:`~stem.descriptor.__init__.Descriptor`
+ instances in the file
+
+ :raises:
+    * **ValueError** if unable to determine the descriptor type
+ * **TypeError** if we cannot parse this descriptor type
+ * **socket.timeout** if our request timed out
+ * **urllib2.URLError** for most request failures
+
+ Note that the urllib2 module may fail with other exception types, in
+ which case we'll pass it along.
+ """
+
+ if descriptor_type is None:
+ descriptor_types = self._guess_descriptor_types()
+
+ if not descriptor_types:
+ raise ValueError("Unable to determine this file's descriptor type")
+ elif len(descriptor_types) > 1:
+        raise ValueError("Unable to disambiguate file's descriptor type from %s" % ', '.join(descriptor_types))
+
+ descriptor_type = descriptor_types[0]
+
+ if directory is None:
+ if self._downloaded_to and os.path.exists(self._downloaded_to):
+ directory = os.path.dirname(self._downloaded_to)
+ else:
+ with tempfile.TemporaryDirectory() as tmp_directory:
+ return self.read(tmp_directory, timeout, retries)
+
+ path = self.download(directory, True, timeout, retries)
+ return parse_file(path, descriptor_type)
+
+ def download(self, directory, decompress = True, timeout = None, retries = 3):
+ """
+ Downloads this file to the given location. If a file already exists this is
+ a no-op.
+
+ :param str directory: destination to download into
+ :param bool decompress: decompress written file
+ :param int timeout: timeout when connection becomes idle, no timeout
+ applied if **None**
+  :param int retries: maximum attempts to impose
+
+ :returns: **str** with the path we downloaded to
+
+ :raises:
+ * **socket.timeout** if our request timed out
+ * **urllib2.URLError** for most request failures
+
+ Note that the urllib2 module may fail with other exception types, in
+ which case we'll pass it along.
+ """
+
+ # TODO: If checksums get added to the index we should replace
+ # the path check below to verify that...
+ #
+ # https://trac.torproject.org/projects/tor/ticket/31204
+
+ filename = self.path.split('/')[-1]
+
+ if decompress:
+ filename = filename.rsplit('.', 1)[0]
+
+ path = os.path.join(directory, filename)
+
+ if not os.path.exists(directory):
+ os.makedirs(directory)
+ elif os.path.exists(path):
+ return path # file already exists
+
+ with open(path, 'wb') as output_file:
+ response = _download(COLLECTOR_URL + self.path, timeout, retries)
+
+ if decompress:
+ response = self.compression.decompress(response)
+
+ output_file.write(response)
+
+ self._downloaded_to = path
+ return path
- def guess_descriptor_types(self):
+ def _guess_descriptor_types(self):
"""
Descriptor @type this file is expected to have based on its path. If unable
to determine any this tuple is empty.
@@ -290,7 +403,7 @@ class CollecTor(object):
url = COLLECTOR_URL + 'index/index.json' + extension
response = compression.decompress(_download(url, self.timeout, self.retries))
- self._cached_index = json.loads(response)
+ self._cached_index = json.loads(stem.util.str_tools._to_unicode(response))
self._cached_index_at = time.time()
return self._cached_index
@@ -325,7 +438,7 @@ class CollecTor(object):
elif end and (entry.end is None or entry.end > end):
continue
- if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in entry.guess_descriptor_types()]):
+ if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in entry._guess_descriptor_types()]):
matches.append(entry)
return matches
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
index 86641f32..914b52b9 100644
--- a/test/unit/descriptor/collector.py
+++ b/test/unit/descriptor/collector.py
@@ -209,13 +209,13 @@ class TestCollector(unittest.TestCase):
def test_guess_descriptor_types(self):
f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21')
- self.assertEqual(('bridge-extra-info 1.3',), f.guess_descriptor_types())
+ self.assertEqual(('bridge-extra-info 1.3',), f._guess_descriptor_types())
f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
- self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f.guess_descriptor_types())
+ self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f._guess_descriptor_types())
f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07')
- self.assertEqual((), f.guess_descriptor_types())
+ self.assertEqual((), f._guess_descriptor_types())
f = File('archive/no_such_file.tar', 20480, '2018-03-19 16:07')
- self.assertEqual((), f.guess_descriptor_types())
+ self.assertEqual((), f._guess_descriptor_types())
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits