[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] [metrics-utils/master 2/3] Tweak dealing with missing exit lists in visitor.py.
Author: Karsten Loesing <karsten.loesing@xxxxxxx>
Date: Tue, 4 Jan 2011 08:38:40 +0100
Subject: Tweak dealing with missing exit lists in visitor.py.
Commit: 373c773727e15b95dcf7cbd0b7e97f7cc7401728
Skip exit lists based on the creation timestamp encoded in each file's
name, not based on the ExitAddress lines the file contains. The output
format is now almost the same as the one produced by Visitor.java.
---
visitor/Todo | 4 ----
visitor/visitor.py | 7 ++++---
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/visitor/Todo b/visitor/Todo
index ff2b0d3..76cf9cc 100644
--- a/visitor/Todo
+++ b/visitor/Todo
@@ -3,8 +3,4 @@ Todo list:
- Identify user-agent strings used by Googlebot et al. and remove them
from the nottor counter, so that people learn about the actual user
ratio.
- - Re-write the Java part in Python once we're happy with its
- functionality.
- - Change visitor.py to skip lines in the web server log for which we
- don't have exit lists available.
diff --git a/visitor/visitor.py b/visitor/visitor.py
index 504971d..704228d 100644
--- a/visitor/visitor.py
+++ b/visitor/visitor.py
@@ -48,6 +48,10 @@ def get_exitlist(exitlist_filepath):
last_exit_date = date(1970, 1, 1) # Unix epoch. Should suffice
for dirpath, _, filenames in os.walk(exitlist_filepath, topdown = False):
for filename in filenames:
+ yr, mo, d, h, m, s, _, _, _ = strptime(filename, '%Y-%m-%d-%H-%M-%S')
+ curr_date = date(yr, mo, d)
+ last_exit_date = max(first_exit_date, curr_date)
+ first_exit_date = min(first_exit_date, curr_date)
fn = os.path.join(dirpath, filename)
try:
f = open(fn)
@@ -55,9 +59,6 @@ def get_exitlist(exitlist_filepath):
if line.startswith('ExitAddress'):
_, ip, dt = line.split(' ', 2)
yr, mo, d, h, m, s, _, _, _ = strptime(dt.rstrip('\s\n'), '%Y-%m-%d %H:%M:%S')
- curr_date = date(yr, mo, d)
- last_exit_date = max(first_exit_date, curr_date)
- first_exit_date = min(first_exit_date, curr_date)
if not ip in exitlist:
exitlist[ip] = []
timestamp = datetime(yr, mo, d, h, m, s)
--
1.7.1