[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [stem/master] Handling descriptor files in a depth first fashion
commit 9f76969739eccf740da0f77378bcabc5672a85bb
Author: Damian Johnson <atagar@xxxxxxxxxxxxxx>
Date: Sat Apr 14 18:30:40 2012 -0700
Handling descriptor files in a depth first fashion
Directories enqueued all of the files that they contained prior to processing
them, which has a couple of obvious disadvantages...
- huge targets like the root directory or years' worth of descriptors can
consume lots of memory with the paths alone
- this could easily cause us to have a huge startup time before we provide the
caller with any descriptors
This was stupid; depth-first parsing makes much more sense.
---
stem/descriptor/reader.py | 55 +++++++++++++++++++++++---------------------
1 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index 66ed65a..01f8809 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -327,36 +327,12 @@ class DescriptorReader:
# adds all of the files that it contains
for root, _, files in os.walk(target, followlinks = self._follow_links):
for filename in files:
- remaining_files.append(os.path.join(root, filename))
+ self._handle_file(os.path.join(root, filename), new_processed_files)
# this can take a while if, say, we're including the root directory
if self._is_stopped.is_set(): break
else:
- # This is a file. Register its last modified timestamp and check if
- # it's a file that we should skip.
-
- last_modified = int(os.stat(target).st_mtime)
- last_used = self._processed_files.get(target)
- new_processed_files[target] = last_modified
-
- if last_used and last_used >= last_modified:
- self._notify_skip_listeners(target, AlreadyRead(last_modified, last_used))
- continue
-
- # The mimetypes module only checks the file extension. To actually
- # check the content (like the 'file' command) we'd need something like
- # pymagic (https://github.com/cloudburst/pymagic).
-
- target_type = mimetypes.guess_type(target)
-
- if target_type[0] in (None, 'text/plain'):
- # either '.txt' or an unknown type
- self._handle_descriptor_file(target)
- elif tarfile.is_tarfile(target):
- # handles gzip, bz2, and decompressed tarballs among others
- self._handle_archive(target)
- else:
- self._notify_skip_listeners(target, UnrecognizedType(target_type))
+ self._handle_file(target, new_processed_files)
self._processed_files = new_processed_files
@@ -377,6 +353,33 @@ class DescriptorReader:
self._iter_notice.wait()
self._iter_notice.clear()
+ def _handle_file(self, target, new_processed_files):
+ # This is a file. Register its last modified timestamp and check if
+ # it's a file that we should skip.
+
+ last_modified = int(os.stat(target).st_mtime)
+ last_used = self._processed_files.get(target)
+ new_processed_files[target] = last_modified
+
+ if last_used and last_used >= last_modified:
+ self._notify_skip_listeners(target, AlreadyRead(last_modified, last_used))
+ return
+
+ # The mimetypes module only checks the file extension. To actually
+ # check the content (like the 'file' command) we'd need something like
+ # pymagic (https://github.com/cloudburst/pymagic).
+
+ target_type = mimetypes.guess_type(target)
+
+ if target_type[0] in (None, 'text/plain'):
+ # either '.txt' or an unknown type
+ self._handle_descriptor_file(target)
+ elif tarfile.is_tarfile(target):
+ # handles gzip, bz2, and decompressed tarballs among others
+ self._handle_archive(target)
+ else:
+ self._notify_skip_listeners(target, UnrecognizedType(target_type))
+
def _handle_descriptor_file(self, target):
try:
with open(target) as target_file:
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits