[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[or-cvs] [metrics-utils/master 1/2] Count Torbutton users over Tor.



Author: Karsten Loesing <karsten.loesing@xxxxxxx>
Date: Wed, 22 Sep 2010 20:58:33 +0200
Subject: Count Torbutton users over Tor.
Commit: 157b1a26eea8b2927901975eb645fb972e2af496

---
 visitor/ChangeLog    |    4 ++-
 visitor/HOWTO        |    9 ++++---
 visitor/Todo         |    2 -
 visitor/VisiTor.java |   58 +++++++++++++++++++++++++++++++++++++++++++++++--
 visitor/plot.R       |   23 ++++++++++++++-----
 5 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/visitor/ChangeLog b/visitor/ChangeLog
index 962b193..8b00c97 100644
--- a/visitor/ChangeLog
+++ b/visitor/ChangeLog
@@ -1,12 +1,14 @@
 VisiTor change log:
 
-Changes in version 0.0.2 - 2010-09-2?
+Changes in version 0.0.2 - 2010-09-22
   - Don't break if we're given zero exit lists.
   - If we saw zero requests on a day, write "0", not "NA". Only write "NA"
     if we're missing a whole day of logs.
   - Warn if we're missing exit lists and skip that part of the server log.
   - Add fourth parameter to write out the part of the server log with Tor
     user requests.
+  - Add a list of user-agent strings used by Torbutton and count the
+    number of requests coming from Tor _and_ using Torbutton.
 
 Changes in version 0.0.1 - 2010-09-19
   - Initial release
diff --git a/visitor/HOWTO b/visitor/HOWTO
index 49711f8..8d96bce 100644
--- a/visitor/HOWTO
+++ b/visitor/HOWTO
@@ -26,10 +26,11 @@ you should look at:
 
   https://metrics.torproject.org/exonerator.html
 
-This script consists of a Java part and an R part. The Java part parses
-a web server log and the downloaded exit list archives and writes daily
-statistics on requests by Tor users to disk. The optional R part can be
-used to visualize the results.
+This script consists of a Java part and an R part. The Java part parses a
+web server log and the downloaded exit list archives and writes daily
+statistics on requests by Tor users to disk. It further detects user-agent
+strings used by different Torbutton versions to count potential Torbutton
+users over Tor. The optional R part can be used to visualize the results.
 
 ---------------------------------------------------------------------------
 
diff --git a/visitor/Todo b/visitor/Todo
index 92a4bed..fe2b8f0 100644
--- a/visitor/Todo
+++ b/visitor/Todo
@@ -1,7 +1,5 @@
 Todo list:
 
-  - Add a list of user-agent strings used by Torbutton and count the
-    number of requests coming from Tor _and_ using Torbutton.
   - Identify user-agent strings used by Googlebot et al. and remove them
     from the nottor counter, so that people learn about the actual user
     ratio.
diff --git a/visitor/VisiTor.java b/visitor/VisiTor.java
index dd004cd..31bffa5 100644
--- a/visitor/VisiTor.java
+++ b/visitor/VisiTor.java
@@ -23,6 +23,28 @@ public final class VisiTor {
     String outputFile = args[2];
     String serverLogPartTorUsers = args.length == 4 ? args[3] : null;
 
+    /* Initialize regular expressions to detect Torbutton user agents. The
+     * user-agent string in Torbutton was changed in the following Git
+     * commits: 48b8300, 0776f7e, cc15032, 6fdfd5a */
+    SortedMap<String, Pattern> torbuttonUserAgents =
+        new TreeMap<String, Pattern>();
+    torbuttonUserAgents.put("torbutton1_2_5", Pattern.compile(
+        "Mozilla/5\\.0 \\(Windows; U; Windows NT 6\\.1; "
+        + "[a-z]{2}-[A-Z]{2}; rv\\:1\\.9\\.2\\.3\\) "
+        + "Gecko/20100401 Firefox/3\\.6\\.3"));
+    torbuttonUserAgents.put("torbutton1_2_1", Pattern.compile(
+        "Mozilla/5\\.0 \\(Windows; U; Windows NT 5\\.1; "
+        + "en-US; rv\\:1\\.9\\.0\\.7\\) "
+        + "Gecko/2009021910 Firefox/3\\.0\\.7"));
+    torbuttonUserAgents.put("torbutton1_2_0", Pattern.compile(
+        "Mozilla/5\\.0 \\(Windows; U; Windows NT 5\\.1; "
+        + "[a-z]{2}-[A-Z]{2}; rv\\:1\\.8\\.1\\.16\\) "
+        + "Gecko/20080702 Firefox/2\\.0\\.0\\.16"));
+    torbuttonUserAgents.put("torbutton1_2_0rc1", Pattern.compile(
+        "Mozilla/5\\.0 \\(Windows; U; Windows NT 5\\.1; "
+        + "en-US; rv\\:1\\.8\\.1\\.14\\) "
+        + "Gecko/20080404 Firefox/2\\.0\\.0\\.14"));
+
     /* Read the first line of the web server log to let the user know
      * early if we think we can't parse it. */
     System.out.print("Reading the first line of your web server log '"
@@ -143,6 +165,8 @@ public final class VisiTor {
     System.out.print("Parsing web server log file... ");
     Map<String, Integer> torRequests = new HashMap<String, Integer>();
     Map<String, Integer> nonTorRequests = new HashMap<String, Integer>();
+    Map<String, Integer> torbuttonRequests =
+        new HashMap<String, Integer>();
     exitAddressLines.add("ExitAddress 0.0.0.0 1970-01-01");
     SimpleDateFormat exitAddressFormat = new SimpleDateFormat(
         "yyyy-MM-dd HH:mm:ss");
@@ -224,6 +248,18 @@ public final class VisiTor {
           int requestsSoFar = torRequests.containsKey(currentDate)
               ? torRequests.get(currentDate) : 0;
           torRequests.put(currentDate, requestsSoFar + 1);
+          String userAgentString = line.trim().split("\"")[
+              line.trim().split("\"").length - 1];
+          for (Map.Entry<String, Pattern> e :
+              torbuttonUserAgents.entrySet()) {
+            if (e.getValue().matcher(userAgentString).matches()) {
+              String torbuttonRequestKey = currentDate + "," + e.getKey();
+              int requests = torbuttonRequests.containsKey(
+                  torbuttonRequestKey) ? torbuttonRequests.get(
+                  torbuttonRequestKey) : 0;
+              torbuttonRequests.put(torbuttonRequestKey, requests + 1);
+            }
+          }
         } else {
           int requestsSoFar = nonTorRequests.containsKey(currentDate)
               ? nonTorRequests.get(currentDate) : 0;
@@ -253,18 +289,34 @@ public final class VisiTor {
     System.out.print("Writing output to disk... ");
     try {
       BufferedWriter bw = new BufferedWriter(new FileWriter(outputFile));
-      bw.write("date,tor,nottor\n");
+      bw.write("date,tor,nottor");
+      for (String torbuttonUserAgent : torbuttonUserAgents.keySet()) {
+        bw.write("," + torbuttonUserAgent);
+      }
+      bw.write("\n");
       String currentDate = allDates.first(), lastDate = allDates.last();
       while (currentDate.compareTo(lastDate) < 0) {
         if (!torRequests.containsKey(currentDate) &&
             !nonTorRequests.containsKey(currentDate)) {
-          bw.write(currentDate + ",NA,NA\n");
+          bw.write(currentDate + ",NA,NA");
+          for (int i = 0; i < torbuttonUserAgents.size(); i++) {
+            bw.write(",NA");
+          }
+          bw.write("\n");
         } else {
           bw.write(currentDate + ","
               + (torRequests.containsKey(currentDate)
               ? torRequests.get(currentDate) : "0") + ","
               + (nonTorRequests.containsKey(currentDate)
-              ? nonTorRequests.get(currentDate) : "0") + "\n");
+              ? nonTorRequests.get(currentDate) : "0"));
+          for (Map.Entry<String, Pattern> e :
+              torbuttonUserAgents.entrySet()) {
+            String torbuttonRequestKey = currentDate + "," + e.getKey();
+            bw.write("," + (torbuttonRequests.containsKey(
+                torbuttonRequestKey) ? torbuttonRequests.get(
+                torbuttonRequestKey) : "0"));
+          }
+          bw.write("\n");
         }
         try {
           currentDate = isoDateFormat.format(isoDateFormat.parse(
diff --git a/visitor/plot.R b/visitor/plot.R
index e1c0e01..26602dd 100644
--- a/visitor/plot.R
+++ b/visitor/plot.R
@@ -16,11 +16,16 @@ data <- read.csv("out.csv", stringsAsFactors = FALSE)
 # Transform the data into a data frame that has the date and the fraction of
 # requests coming from Tor users
 data <- data.frame(date = as.Date(data$date),
-                   reqfrac = data$tor / (data$tor + data$nottor))
+                   tor = data$tor / (data$tor + data$nottor),
+                   torbutton = rowSums(data[4:length(data)] /
+                               (data$tor + data$nottor)))
 
-# Make a plot with the date on the x axis and the fraction of requests on
-# the y axis
-ggplot(data, aes(x = date, y = reqfrac)) +
+# Transform the data so that we have a single data point per line
+data <- melt(data, id = "date")
+
+# Make a plot with the date on the x axis, the fraction of requests on the
+# y axis, and Tor users vs. Tor + Torbutton users encoded as colors.
+ggplot(data, aes(x = date, y = value, colour = variable)) +
 
 # Make it a line plot
 geom_line() +
@@ -32,8 +37,14 @@ scale_x_date(name = "") +
 # from the graph title; also show fractions as percentages
 scale_y_continuous(name = "", formatter = "percent") +
 
-# Give the graph a title
-opts(title = "Fraction of requests probably coming from Tor users\n")
+# Leave the legend title empty, because it's obvious, too. But use nicer
+# labels than the raw column names for the legend entries.
+scale_colour_hue(name = "", breaks = c("tor", "torbutton"),
+                            labels = c("Tor", "Tor + Torbutton")) +
+
+# Give the graph a title and move the legend to the top.
+opts(title = "Fraction of requests probably coming from Tor users",
+     legend.position = "top")
 
 # Save the graph to disk as visitors.png
 ggsave("visitors.png", width = 8, height = 5, dpi = 72)
-- 
1.7.1