[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] [metrics-utils/master 1/2] Count Torbutton users over Tor.
Author: Karsten Loesing <karsten.loesing@xxxxxxx>
Date: Wed, 22 Sep 2010 20:58:33 +0200
Subject: Count Torbutton users over Tor.
Commit: 157b1a26eea8b2927901975eb645fb972e2af496
---
visitor/ChangeLog | 4 ++-
visitor/HOWTO | 9 ++++---
visitor/Todo | 2 -
visitor/VisiTor.java | 58 +++++++++++++++++++++++++++++++++++++++++++++++--
visitor/plot.R | 23 ++++++++++++++-----
5 files changed, 80 insertions(+), 16 deletions(-)
diff --git a/visitor/ChangeLog b/visitor/ChangeLog
index 962b193..8b00c97 100644
--- a/visitor/ChangeLog
+++ b/visitor/ChangeLog
@@ -1,12 +1,14 @@
VisiTor change log:
-Changes in version 0.0.2 - 2010-09-2?
+Changes in version 0.0.2 - 2010-09-22
- Don't break if we're given zero exit lists.
- If we saw zero requests on a day, write "0", not "NA". Only write "NA"
if we're missing a whole day of logs.
- Warn if we're missing exit lists and skip that part of the server log.
- Add fourth parameter to write out the part of the server log with Tor
user requests.
+ - Add a list of user-agent strings used by Torbutton and count the
+ number of requests coming from Tor _and_ using Torbutton.
Changes in version 0.0.1 - 2010-09-19
- Initial release
diff --git a/visitor/HOWTO b/visitor/HOWTO
index 49711f8..8d96bce 100644
--- a/visitor/HOWTO
+++ b/visitor/HOWTO
@@ -26,10 +26,11 @@ you should look at:
https://metrics.torproject.org/exonerator.html
-This script consists of a Java part and an R part. The Java part parses
-a web server log and the downloaded exit list archives and writes daily
-statistics on requests by Tor users to disk. The optional R part can be
-used to visualize the results.
+This script consists of a Java part and an R part. The Java part parses a
+web server log and the downloaded exit list archives and writes daily
+statistics on requests by Tor users to disk. It further detects user-agent
+strings used by different Torbutton versions to count potential Torbutton
+users over Tor. The optional R part can be used to visualize the results.
---------------------------------------------------------------------------
diff --git a/visitor/Todo b/visitor/Todo
index 92a4bed..fe2b8f0 100644
--- a/visitor/Todo
+++ b/visitor/Todo
@@ -1,7 +1,5 @@
Todo list:
- - Add a list of user-agent strings used by Torbutton and count the
- number of requests coming from Tor _and_ using Torbutton.
- Identify user-agent strings used by Googlebot et al. and remove them
from the nottor counter, so that people learn about the actual user
ratio.
diff --git a/visitor/VisiTor.java b/visitor/VisiTor.java
index dd004cd..31bffa5 100644
--- a/visitor/VisiTor.java
+++ b/visitor/VisiTor.java
@@ -23,6 +23,28 @@ public final class VisiTor {
String outputFile = args[2];
String serverLogPartTorUsers = args.length == 4 ? args[3] : null;
+ /* Initialize regular expressions to detect Torbutton user agents. The
+ * user-agent string in Torbutton was changed in the following Git
+ * commits: 48b8300, 0776f7e, cc15032, 6fdfd5a */
+ SortedMap<String, Pattern> torbuttonUserAgents =
+ new TreeMap<String, Pattern>();
+ torbuttonUserAgents.put("torbutton1_2_5", Pattern.compile(
+ "Mozilla/5\\.0 \\(Windows; U; Windows NT 6\\.1; "
+ + "[a-z]{2}-[A-Z]{2}; rv\\:1\\.9\\.2\\.3\\) "
+ + "Gecko/20100401 Firefox/3\\.6\\.3"));
+ torbuttonUserAgents.put("torbutton1_2_1", Pattern.compile(
+ "Mozilla/5\\.0 \\(Windows; U; Windows NT 5\\.1; "
+ + "en-US; rv\\:1\\.9\\.0\\.7\\) "
+ + "Gecko/2009021910 Firefox/3\\.0\\.7"));
+ torbuttonUserAgents.put("torbutton1_2_0", Pattern.compile(
+ "Mozilla/5\\.0 \\(Windows; U; Windows NT 5\\.1; "
+ + "[a-z]{2}-[A-Z]{2}; rv\\:1\\.8\\.1\\.16\\) "
+ + "Gecko/20080702 Firefox/2\\.0\\.0\\.16"));
+ torbuttonUserAgents.put("torbutton1_2_0rc1", Pattern.compile(
+ "Mozilla/5\\.0 \\(Windows; U; Windows NT 5\\.1; "
+ + "en-US; rv\\:1\\.8\\.1\\.14\\) "
+ + "Gecko/20080404 Firefox/2\\.0\\.0\\.14"));
+
/* Read the first line of the web server log to let the user know
* early if we think we can't parse it. */
System.out.print("Reading the first line of your web server log '"
@@ -143,6 +165,8 @@ public final class VisiTor {
System.out.print("Parsing web server log file... ");
Map<String, Integer> torRequests = new HashMap<String, Integer>();
Map<String, Integer> nonTorRequests = new HashMap<String, Integer>();
+ Map<String, Integer> torbuttonRequests =
+ new HashMap<String, Integer>();
exitAddressLines.add("ExitAddress 0.0.0.0 1970-01-01");
SimpleDateFormat exitAddressFormat = new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss");
@@ -224,6 +248,18 @@ public final class VisiTor {
int requestsSoFar = torRequests.containsKey(currentDate)
? torRequests.get(currentDate) : 0;
torRequests.put(currentDate, requestsSoFar + 1);
+ String userAgentString = line.trim().split("\"")[
+ line.trim().split("\"").length - 1];
+ for (Map.Entry<String, Pattern> e :
+ torbuttonUserAgents.entrySet()) {
+ if (e.getValue().matcher(userAgentString).matches()) {
+ String torbuttonRequestKey = currentDate + "," + e.getKey();
+ int requests = torbuttonRequests.containsKey(
+ torbuttonRequestKey) ? torbuttonRequests.get(
+ torbuttonRequestKey) : 0;
+ torbuttonRequests.put(torbuttonRequestKey, requests + 1);
+ }
+ }
} else {
int requestsSoFar = nonTorRequests.containsKey(currentDate)
? nonTorRequests.get(currentDate) : 0;
@@ -253,18 +289,34 @@ public final class VisiTor {
System.out.print("Writing output to disk... ");
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(outputFile));
- bw.write("date,tor,nottor\n");
+ bw.write("date,tor,nottor");
+ for (String torbuttonUserAgent : torbuttonUserAgents.keySet()) {
+ bw.write("," + torbuttonUserAgent);
+ }
+ bw.write("\n");
String currentDate = allDates.first(), lastDate = allDates.last();
while (currentDate.compareTo(lastDate) < 0) {
if (!torRequests.containsKey(currentDate) &&
!nonTorRequests.containsKey(currentDate)) {
- bw.write(currentDate + ",NA,NA\n");
+ bw.write(currentDate + ",NA,NA");
+ for (int i = 0; i < torbuttonUserAgents.size(); i++) {
+ bw.write(",NA");
+ }
+ bw.write("\n");
} else {
bw.write(currentDate + ","
+ (torRequests.containsKey(currentDate)
? torRequests.get(currentDate) : "0") + ","
+ (nonTorRequests.containsKey(currentDate)
- ? nonTorRequests.get(currentDate) : "0") + "\n");
+ ? nonTorRequests.get(currentDate) : "0"));
+ for (Map.Entry<String, Pattern> e :
+ torbuttonUserAgents.entrySet()) {
+ String torbuttonRequestKey = currentDate + "," + e.getKey();
+ bw.write("," + (torbuttonRequests.containsKey(
+ torbuttonRequestKey) ? torbuttonRequests.get(
+ torbuttonRequestKey) : "0"));
+ }
+ bw.write("\n");
}
try {
currentDate = isoDateFormat.format(isoDateFormat.parse(
diff --git a/visitor/plot.R b/visitor/plot.R
index e1c0e01..26602dd 100644
--- a/visitor/plot.R
+++ b/visitor/plot.R
@@ -16,11 +16,16 @@ data <- read.csv("out.csv", stringsAsFactors = FALSE)
# Transform the data into a data frame that has the date and the fraction of
# requests coming from Tor users
data <- data.frame(date = as.Date(data$date),
- reqfrac = data$tor / (data$tor + data$nottor))
+ tor = data$tor / (data$tor + data$nottor),
+ torbutton = rowSums(data[4:length(data)] /
+ (data$tor + data$nottor)))
-# Make a plot with the date on the x axis and the fraction of requests on
-# the y axis
-ggplot(data, aes(x = date, y = reqfrac)) +
+# Transform the data so that we have a single data point per line
+data <- melt(data, id = "date")
+
+# Make a plot with the date on the x axis, the fraction of requests on the
+# y axis, and Tor users vs. Tor + Torbutton users encoded as colors.
+ggplot(data, aes(x = date, y = value, colour = variable)) +
# Make it a line plot
geom_line() +
@@ -32,8 +37,14 @@ scale_x_date(name = "") +
# from the graph title; also show fractions as percentages
scale_y_continuous(name = "", formatter = "percent") +
-# Give the graph a title
-opts(title = "Fraction of requests probably coming from Tor users\n")
+# Don't add a legend title, because it's obvious, too. But use nicer names
+# than the column names for the legend.
+scale_colour_hue(name = "", breaks = c("tor", "torbutton"),
+ labels = c("Tor", "Tor + Torbutton")) +
+
+# Give the graph a title and move the legend to the top.
+opts(title = "Fraction of requests probably coming from Tor users",
+ legend.position = "top")
# Save the graph to disk as visitors.png
ggsave("visitors.png", width = 8, height = 5, dpi = 72)
--
1.7.1