[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [ooni-probe/master] Add report archival script
commit cf8a8cc3a7a5798025f2c7d33bed3f8c8824f1af
Author: aagbsn <aagbsn@xxxxxxxx>
Date: Mon Dec 24 04:15:54 2012 +0000
Add report archival script
---
scripts/archive_reports.py | 166 ++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 166 insertions(+), 0 deletions(-)
diff --git a/scripts/archive_reports.py b/scripts/archive_reports.py
new file mode 100755
index 0000000..f48fa08
--- /dev/null
+++ b/scripts/archive_reports.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+import yaml
+import sys
+import glob
+import fcntl
+import os
+import re
+from ipaddr import IPAddress
+from datetime import timedelta
+from datetime import datetime
+from ooni.otime import fromTimestamp, timestamp
+from ooni.otime import InvalidTimestampFormat, utcDateNow
+from ooni.utils import log
+
+###############################################################################
+# You can set some config options here #
+###############################################################################
+report_age = 1 # hours
+report_archive_dir = '/home/user/oonib/reports/archived'
+report_source_dir = '/home/user/oonib/reports'
+valid_test_versions = ['0.1', '0.1.1', '0.4', '0.1.3']
+default_probe_cc = '??'
+target_permission = 0444
+path_permission = 0755
+retry_attempts = 100
+###############################################################################
+
+now = utcDateNow()
+delta = timedelta(hours=report_age)
+
+def filter_reports_by_age(report):
+ try:
+ ts,__,__ = os.path.basename(report).split('_')
+ if now - fromTimestamp(ts) > delta:
+ return True
+ except (InvalidTimestampFormat, ValueError):
+ return False
+
+def validate_fields(fields):
+ log.debug("Report fields are: %s" % fields)
+
+ # check report version
+ if fields['test_version'] not in valid_test_versions:
+ log.err("Report submitted with invalid report version!")
+ return False
+
+ # check report CC
+ #XXX: confirm what value we use for default CC and whether
+ # or not we should support > 2 character CC
+ if fields['probe_cc'] is None:
+ fields['probe_cc'] = default_probe_cc
+ if not re.match('[A-Z\?]{2,4}', fields['probe_cc'].upper()):
+ log.err("Report submitted with invalid CC!")
+ return False
+
+ # check report ASN
+ if fields['probe_asn'] is None:
+ fields['probe_asn'] = 'AS0'
+ if not re.match('^AS[0-9]{1,10}', fields['probe_asn'].upper()):
+ log.err("Report submitted with invalid AS Number!")
+ return False
+
+ # check report timestamp
+ try:
+ datetime_ts = datetime.fromtimestamp(fields['start_time'])
+ datetime_str = timestamp(datetime_ts)
+ except InvalidTimestampFormat:
+ log.err("Report submitted with invalid timestamp!")
+ return False
+
+ # check report IP
+ try:
+ IPAddress(fields['probe_ip'])
+ except ValueError:
+ log.err("Report submitted with invalid IP Address!")
+ return False
+
+ # all looks good!
+ return True
+
+def get_report_header_fields(report_header):
+ required_fields = ['probe_asn', 'probe_cc', 'probe_ip', 'start_time',
+ 'test_name', 'test_version']
+ try:
+ return dict([(k,report_header[k]) for k in required_fields ])
+ except KeyError:
+ return None
+
+def get_target_or_fail(fields, report):
+ # set the target filename
+ reportFormatVersion = fields['test_version']
+ CC = fields['probe_cc']
+ # XXX: wouldn't hurt to check timestamp for sanity again?
+ dateInISO8601Format,__,__ = os.path.basename(report).split('_')
+ probeASNumber = fields['probe_asn']
+
+ # make sure path reportFormatVersion/CC exists
+ path = os.path.abspath(report_archive_dir)
+ for component in [reportFormatVersion, CC]:
+ path = os.path.join(path, component)
+ if not os.path.isdir(path):
+ try:
+ os.mkdir(path, path_permission)
+ log.debug("mkdir path: %s" % path)
+ except OSError:
+ return None
+
+ # if the target file already exists, try to find another filename
+ filename = "%s_%s.yamloo" % (dateInISO8601Format, probeASNumber)
+ target = os.path.join(path, filename)
+
+ # try to get a unique filename. os.open as used below requires
+ # that the file not already exist
+ naming_attempts = 1
+ while os.path.exists(target) and naming_attempts < retry_attempts:
+ filename = "%s_%s.%d.yamloo" % (dateInISO8601Format, probeASNumber,
+ naming_attempts)
+ target = os.path.join(path, filename)
+ naming_attempts = naming_attempts + 1
+
+ if naming_attempts >= retry_attempts:
+ log.err("Failed getting unique filename %d times; skipping" % i)
+ return None
+ return target
+
+# grab list of reports
+reports = glob.glob(report_source_dir+'/*')
+reports_to_archive = filter(filter_reports_by_age, reports)
+
+# iterate over the reports to archive
+for report in reports_to_archive:
+ log.debug("Parsing report: %s" % report)
+ try:
+ #XXX: verify that os.fdopen works as expected
+ f = os.fdopen(os.open(report, os.O_RDONLY|os.O_EXCL|os.O_NONBLOCK))
+ except IOError:
+ log.err("Unable to get exclusive lock on %s; skipping" % report)
+ continue
+
+ # parse the header and validate it
+ yamloo = yaml.safe_load_all(f)
+ report_header = yamloo.next()
+ fields = get_report_header_fields(report_header)
+ if not validate_fields(fields):
+ continue
+
+ # get a target filename or fail
+ target = get_target_or_fail(fields, report)
+ if not target:
+ continue
+
+ log.debug("target: %s" % target)
+
+ try:
+ #XXX: My system does not have os.O_EXLOCK. Verify this works as is.
+ g = os.fdopen(os.open(target, os.O_CREAT|os.O_EXCL|os.O_NONBLOCK))
+
+ os.rename(report, target)
+ os.chmod(target, target_permission)
+ f.close()
+ g.close()
+
+ except IOError:
+ # unable to lock the file... still held open?
+ log.err("Failed to lock target file. Possible race condition!")
+ continue
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits