[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[tor-commits] [stem/master] Adjust CollecTor File class



commit 4c744badc658d3d93d113972bfbf7cb463298ee4
Author: Damian Johnson <atagar@xxxxxxxxxxxxxx>
Date:   Mon Jul 29 17:35:32 2019 -0700

    Adjust CollecTor File class
    
    A handful of very tiny adjustments: dropping the unused tar attribute, making
    descriptor guessing a static function (like the others), fixing some minor
    edge cases, etc.
---
 stem/descriptor/collector.py      |  51 ++++++------
 test/unit/descriptor/collector.py | 161 ++++++++++++++++----------------------
 2 files changed, 92 insertions(+), 120 deletions(-)

diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index d1c90e0e..f76fa225 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -157,7 +157,6 @@ class File(object):
   :var str path: file path within collector
   :var stem.descriptor.Compression compression: file compression, **None** if
     this cannot be determined
-  :var bool tar: **True** if a tarball, **False** otherwise
   :var int size: size of the file
 
   :var datetime start: beginning of the time range descriptors are for,
@@ -170,13 +169,12 @@ class File(object):
   def __init__(self, path, size, last_modified):
     self.path = path
     self.compression = File._guess_compression(path)
-    self.tar = path.endswith('.tar') or '.tar.' in path
     self.size = size
 
     self.start, self.end = File._guess_time_range(path)
     self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M')
 
-    self._guessed_type = None
+    self._guessed_type = File._guess_descriptor_types(path)
     self._downloaded_to = None  # location we last downloaded to
 
   def read(self, directory = None, descriptor_type = None, timeout = None, retries = 3):
@@ -220,21 +218,21 @@ class File(object):
     """
 
     if descriptor_type is None:
-      descriptor_types = self._guess_descriptor_types()
-
-      if not descriptor_types:
+      if not self._guessed_type:
         raise ValueError("Unable to determine this file's descriptor type")
-      elif len(descriptor_types) > 1:
-        raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(descriptor_types))
+      elif len(self._guessed_type) > 1:
+        raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(self._guessed_type))
 
-      descriptor_type = descriptor_types[0]
+      descriptor_type = self._guessed_type[0]
 
     if directory is None:
       if self._downloaded_to and os.path.exists(self._downloaded_to):
         directory = os.path.dirname(self._downloaded_to)
       else:
         with tempfile.TemporaryDirectory() as tmp_directory:
-          return self.read(tmp_directory, timeout, retries)
+          return self.read(tmp_directory, descriptor_type, timeout, retries)
+
+    # TODO: the following will not work if the tar contains multiple types or a type we do not support
 
     path = self.download(directory, True, timeout, retries)
     return parse_file(path, descriptor_type)
@@ -267,7 +265,7 @@ class File(object):
 
     filename = self.path.split('/')[-1]
 
-    if decompress:
+    if self.compression != Compression.PLAINTEXT and decompress:
       filename = filename.rsplit('.', 1)[0]
 
     path = os.path.join(directory, filename)
@@ -277,36 +275,35 @@ class File(object):
     elif os.path.exists(path):
       return path  # file already exists
 
-    with open(path, 'wb') as output_file:
-      response = _download(COLLECTOR_URL + self.path, timeout, retries)
+    response = _download(COLLECTOR_URL + self.path, timeout, retries)
 
-      if decompress:
-        response = self.compression.decompress(response)
+    if decompress:
+      response = self.compression.decompress(response)
 
+    with open(path, 'wb') as output_file:
       output_file.write(response)
 
     self._downloaded_to = path
     return path
 
-  def _guess_descriptor_types(self):
+  @staticmethod
+  def _guess_descriptor_types(path):
     """
     Descriptor @type this file is expected to have based on its path. If unable
     to determine any this tuple is empty.
 
-    :returns: **tuple** with the descriptor types this file is expected to have
-    """
+    Hopefully this will be replaced with an explicit value in the future:
 
-    if self._guessed_type is None:
-      guessed_type = ()
+      https://trac.torproject.org/projects/tor/ticket/31204
 
-      for path_prefix, types in COLLECTOR_DESC_TYPES.items():
-        if self.path.startswith(path_prefix):
-          guessed_type = (types,) if isinstance(types, str) else types
-          break
+    :returns: **tuple** with the descriptor types this file is expected to have
+    """
 
-      self._guessed_type = guessed_type
+    for path_prefix, types in COLLECTOR_DESC_TYPES.items():
+      if path.startswith(path_prefix):
+        return (types,) if isinstance(types, str) else types
 
-    return self._guessed_type
+    return ()
 
   @staticmethod
   def _guess_compression(path):
@@ -437,7 +434,7 @@ class CollecTor(object):
       elif end and (f.end is None or f.end > end):
         continue
 
-      if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f._guess_descriptor_types()]):
+      if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f._guessed_type]):
         matches.append(f)
 
     return matches
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
index ad0087dd..77c1c460 100644
--- a/test/unit/descriptor/collector.py
+++ b/test/unit/descriptor/collector.py
@@ -21,68 +21,105 @@ except ImportError:
 
 URL_OPEN = 'urllib.request.urlopen' if stem.prereq.is_python_3() else 'urllib2.urlopen'
 
-MINIMAL_INDEX = {
-  'index_created': '2017-12-25 21:06',
-  'build_revision': '56a303e',
-  'path': 'https://collector.torproject.org'
-}
-
-MINIMAL_INDEX_JSON = b'{"index_created":"2017-12-25 21:06","build_revision":"56a303e","path":"https://collector.torproject.org"}'
 
 with open(get_resource('collector_index.json'), 'rb') as index_file:
-  EXAMPLE_INDEX_CONTENT = index_file.read()
+  EXAMPLE_INDEX_JSON = index_file.read()
 
 
 class TestCollector(unittest.TestCase):
+  # tests for the File class
+
+  def test_file_guess_descriptor_types(self):
+    test_values = {
+      'archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz': ('bridge-extra-info 1.3',),
+      'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': ('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'),
+      'archive/webstats/webstats-2015-03.tar': (),
+      'archive/no_such_file.tar': (),
+    }
+
+    for path, expected in test_values.items():
+      self.assertEqual(expected, File._guess_descriptor_types(path))
+
+  def test_file_guess_compression(self):
+    test_values = {
+      'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': Compression.LZMA,
+      'archive/webstats/webstats-2015-03.tar': Compression.PLAINTEXT,
+      'recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos': Compression.PLAINTEXT,
+    }
+
+    for path, expected in test_values.items():
+      self.assertEqual(expected, File._guess_compression(path))
+
+  def test_file_guess_time_range(self):
+    test_values = {
+      'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz':
+        (datetime.datetime(2014, 1, 1), datetime.datetime(2014, 2, 1)),
+      'recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos':
+        (datetime.datetime(2019, 7, 3, 2, 5, 0), datetime.datetime(2019, 7, 3, 3, 5, 0)),
+      'archive/relay-descriptors/certs.tar.xz':
+        (None, None),
+      'archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz':
+        (datetime.datetime(2014, 12, 1), datetime.datetime(2015, 1, 1)),
+      'recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos':
+        (datetime.datetime(2019, 7, 3, 23, 5, 0), datetime.datetime(2019, 7, 4, 0, 5, 0))
+    }
+
+    for path, (expected_start, expected_end) in test_values.items():
+      f = File(path, 7515396, '2014-02-07 03:59')
+      self.assertEqual(expected_start, f.start)
+      self.assertEqual(expected_end, f.end)
+
+  # tests for the CollecTor class
+
   @patch(URL_OPEN)
-  def test_download_plaintext(self, urlopen_mock):
-    urlopen_mock.return_value = io.BytesIO(MINIMAL_INDEX_JSON)
+  def test_index_plaintext(self, urlopen_mock):
+    urlopen_mock.return_value = io.BytesIO(EXAMPLE_INDEX_JSON)
 
     collector = CollecTor()
-    self.assertEqual(MINIMAL_INDEX, collector.index(Compression.PLAINTEXT))
+    self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.PLAINTEXT))
     urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json', timeout = None)
 
   @patch(URL_OPEN)
-  def test_download_gzip(self, urlopen_mock):
+  def test_index_gzip(self, urlopen_mock):
     if not Compression.GZIP.available:
       self.skipTest('(gzip compression unavailable)')
       return
 
     import zlib
-    urlopen_mock.return_value = io.BytesIO(zlib.compress(MINIMAL_INDEX_JSON))
+    urlopen_mock.return_value = io.BytesIO(zlib.compress(EXAMPLE_INDEX_JSON))
 
     collector = CollecTor()
-    self.assertEqual(MINIMAL_INDEX, collector.index(Compression.GZIP))
+    self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.GZIP))
     urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.gz', timeout = None)
 
   @patch(URL_OPEN)
-  def test_download_bz2(self, urlopen_mock):
+  def test_index_bz2(self, urlopen_mock):
     if not Compression.BZ2.available:
       self.skipTest('(bz2 compression unavailable)')
       return
 
     import bz2
-    urlopen_mock.return_value = io.BytesIO(bz2.compress(MINIMAL_INDEX_JSON))
+    urlopen_mock.return_value = io.BytesIO(bz2.compress(EXAMPLE_INDEX_JSON))
 
     collector = CollecTor()
-    self.assertEqual(MINIMAL_INDEX, collector.index(Compression.BZ2))
+    self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.BZ2))
     urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.bz2', timeout = None)
 
   @patch(URL_OPEN)
-  def test_download_lzma(self, urlopen_mock):
+  def test_index_lzma(self, urlopen_mock):
     if not Compression.LZMA.available:
       self.skipTest('(lzma compression unavailable)')
       return
 
     import lzma
-    urlopen_mock.return_value = io.BytesIO(lzma.compress(MINIMAL_INDEX_JSON))
+    urlopen_mock.return_value = io.BytesIO(lzma.compress(EXAMPLE_INDEX_JSON))
 
     collector = CollecTor()
-    self.assertEqual(MINIMAL_INDEX, collector.index(Compression.LZMA))
+    self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.LZMA))
     urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.xz', timeout = None)
 
   @patch(URL_OPEN)
-  def test_download_retries(self, urlopen_mock):
+  def test_index_retries(self, urlopen_mock):
     urlopen_mock.side_effect = IOError('boom')
 
     collector = CollecTor(retries = 0)
@@ -95,11 +132,6 @@ class TestCollector(unittest.TestCase):
     self.assertRaisesRegexp(IOError, 'boom', collector.index)
     self.assertEqual(5, urlopen_mock.call_count)
 
-  @patch(URL_OPEN, Mock(return_value = io.BytesIO(MINIMAL_INDEX_JSON)))
-  def test_index(self):
-    collector = CollecTor()
-    self.assertEqual(MINIMAL_INDEX, collector.index(Compression.PLAINTEXT))
-
   @patch(URL_OPEN, Mock(return_value = io.BytesIO(b'not json')))
   def test_index_malformed_json(self):
     collector = CollecTor()
@@ -118,104 +150,47 @@ class TestCollector(unittest.TestCase):
         collector = CollecTor()
         self.assertRaisesRegexp(IOError, 'Failed to decompress as %s' % compression, collector.index, compression)
 
-  @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT)))
-  def test_real_index(self):
-    collector = CollecTor()
-    self.assertEqual(EXAMPLE_INDEX, collector.index(compression = Compression.PLAINTEXT))
-
   @patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX))
-  def test_contents(self):
+  def test_files(self):
     collector = CollecTor()
     files = collector.files()
-
     self.assertEqual(85, len(files))
-    test_path = 'archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz'
 
-    extrainfo_file = list(filter(lambda x: x.path == test_path, files))[0]
-    self.assertEqual(test_path, extrainfo_file.path)
+    extrainfo_file = list(filter(lambda x: x.path.endswith('extra-infos-2007-09.tar.xz'), files))[0]
+    self.assertEqual('archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz', extrainfo_file.path)
     self.assertEqual(Compression.LZMA, extrainfo_file.compression)
-    self.assertEqual(True, extrainfo_file.tar)
     self.assertEqual(6459884, extrainfo_file.size)
     self.assertEqual(datetime.datetime(2016, 6, 23, 9, 54), extrainfo_file.last_modified)
 
-  def test_file_compression_attributes(self):
-    f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
-    self.assertEqual(Compression.LZMA, f.compression)
-    self.assertEqual(True, f.tar)
-
-    f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07')
-    self.assertEqual(Compression.PLAINTEXT, f.compression)
-    self.assertEqual(True, f.tar)
-
-    f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05')
-    self.assertEqual(Compression.PLAINTEXT, f.compression)
-    self.assertEqual(False, f.tar)
-
-  def test_file_date_attributes(self):
-    f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
-    self.assertEqual(datetime.datetime(2014, 1, 1), f.start)
-    self.assertEqual(datetime.datetime(2014, 2, 1), f.end)
-
-    f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05')
-    self.assertEqual(datetime.datetime(2019, 7, 3, 2, 5, 0), f.start)
-    self.assertEqual(datetime.datetime(2019, 7, 3, 3, 5, 0), f.end)
-
-    f = File('archive/relay-descriptors/certs.tar.xz', 144696, '2019-07-03 03:29')
-    self.assertEqual(None, f.start)
-    self.assertEqual(None, f.end)
-
-    # check date boundaries
-
-    f = File('archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz', 7515396, '2014-02-07 03:59')
-    self.assertEqual(datetime.datetime(2015, 1, 1), f.end)
-
-    f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05')
-    self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end)
-
   @patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX))
-  def test_file_query_by_type(self):
+  def test_files_by_descriptor_type(self):
     collector = CollecTor()
 
-    expected = [
+    self.assertEqual([
       'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz',
       'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz',
       'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
       'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors',
       'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors',
       'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors',
-    ]
-
-    self.assertEqual(expected, list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor'))))
+    ], [f.path for f in collector.files(descriptor_type = 'server-descriptor')])
 
   @patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX))
-  def test_file_query_by_date(self):
+  def test_file_by_date(self):
     collector = CollecTor()
 
     self.assertEqual([
       'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors',
       'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors',
       'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors',
-    ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1)))))
+    ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1))])
 
     self.assertEqual([
       'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz',
       'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz',
       'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
-    ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1)))))
+    ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1))])
 
     self.assertEqual([
       'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
-    ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1)))))
-
-  def test_guess_descriptor_types(self):
-    f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21')
-    self.assertEqual(('bridge-extra-info 1.3',), f._guess_descriptor_types())
-
-    f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
-    self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f._guess_descriptor_types())
-
-    f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07')
-    self.assertEqual((), f._guess_descriptor_types())
-
-    f = File('archive/no_such_file.tar', 20480, '2018-03-19 16:07')
-    self.assertEqual((), f._guess_descriptor_types())
+    ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1))])



_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits