[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [webstats/master] Throw away http lines
commit 93c48ff3286c9db082973bdf4a7b10fc8edcc8ee
Author: Sebastian Hahn <sebastian@xxxxxxxxxxxxxx>
Date: Wed Nov 18 16:31:30 2015 +0100
Throw away http lines
---
src/sanitize.py | 14 +++++++++-----
src/treat_new_logs.sh | 8 ++++++--
2 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/src/sanitize.py b/src/sanitize.py
index 0c5de19..d6baf41 100755
--- a/src/sanitize.py
+++ b/src/sanitize.py
@@ -6,7 +6,7 @@ The following sanitizing steps are performed on data read from stdin:
2. Die if other hosts than '0.0.0.0' or '0.0.0.1' are specified.
3. Discard all lines with other methods than GET.
4. Die if a protocol other than HTTP is used.
- 5. Discard all lines with status code 404.
+ 5. Discard all lines with status code 400 and 404.
6. Override client with '-'.
7. Override user with '-'.
8. Override time with '00:00:00 +0000'.
@@ -46,7 +46,7 @@ if matched is None:
sys.exit(1)
today = dateutil.parser.parse(matched.group(1))
-is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|)[^ ]* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
+is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|).* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n'
day_before = today - datetime.timedelta(days=1)
@@ -59,18 +59,22 @@ with open(yesterday_fname, 'a') as file_old:
matched = is_valid_regex.match(line)
if matched is None:
print(line, "Last line does not match critera", file=sys.stderr)
- sys.exit(1)
+ continue
date = dateutil.parser.parse(matched.group(2))
if today != date and day_before != date:
print(line, "Last line does not match date constraints. today:", today,
" day before:", day_before, " date:", date, file=sys.stderr)
- sys.exit(1)
+ continue
requesttype = matched.group(4)
if requesttype != "GET" and requesttype != "HEAD":
#print(matched.expand(sanitized_regex), file=sys.stderr, end="")
continue
- if matched.group(7) == "404":
+ if matched.group(7) == "404" or matched.group(7) == "400":
+ #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
+ continue
+
+ if matched.group(1) == "0":
#print(matched.expand(sanitized_regex), file=sys.stderr, end="")
continue
diff --git a/src/treat_new_logs.sh b/src/treat_new_logs.sh
index 147ebec..144eec3 100755
--- a/src/treat_new_logs.sh
+++ b/src/treat_new_logs.sh
@@ -8,6 +8,8 @@ SCRIPTDIR="${BASEDIR}/bin/"
BASEINCOMINGDIR="${BASEDIR}/incoming/"
+INTERESTING_HOSTS="www.torproject.org dist.torproject.org"
+
cd "${BASEINCOMINGDIR}"
for host in *; do
INCOMINGDIR="${BASEINCOMINGDIR}/${host}/"
@@ -30,12 +32,12 @@ for host in *; do
cp "${INCOMINGDIR}/${file}" "${WORKDIR}/${file}"
cd "${WORKDIR}"
gunzip ${file}
- COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}")
+ COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}" 2>>"${WORKDIR}/errors")
COMPLETED_BASE=$(basename $COMPLETED)
COMPLETED_BASE=${COMPLETED_BASE%_sanitized}
sort "${COMPLETED}" > "${COMPLETED}_sorted"
xz -ck9e "${COMPLETED}_sorted" > "${OUTDIR}/${COMPLETED_BASE}.xz"
- mv "${COMPLETED}_sorted" "${WORKDIR_AWSTATS}"
+ mv "${OUTDIR}/${COMPLETED_BASE}.xz" "${WORKDIR_AWSTATS}"
rm "${WORKDIR}/${basefile}"
rm "${WORKDIR}/${COMPLETED_BASE}_sanitized"
@@ -43,3 +45,5 @@ for host in *; do
done
done
+# Now that we have all output files, process them with awstats
+
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits