[tor-commits] [metrics-tasks/master] Add parsing and graphing code for #3260.
commit e435e803f808994aa42de70e4b8af048c0005509
Author: Karsten Loesing <karsten.loesing@xxxxxxx>
Date: Tue Mar 20 09:15:23 2012 +0100
Add parsing and graphing code for #3260.
---
task-3260/.gitignore | 8 ++
task-3260/ExtractClientSpeedTrends.java | 150 +++++++++++++++++++++++++++++++
task-3260/README | 24 +++++
task-3260/client-speed-trends.R | 30 ++++++
task-3260/run.sh | 3 +
5 files changed, 215 insertions(+), 0 deletions(-)
diff --git a/task-3260/.gitignore b/task-3260/.gitignore
new file mode 100644
index 0000000..8231133
--- /dev/null
+++ b/task-3260/.gitignore
@@ -0,0 +1,8 @@
+in/
+status/
+out/
+*.pdf
+*.jar
+*.class
+*.png
+
diff --git a/task-3260/ExtractClientSpeedTrends.java b/task-3260/ExtractClientSpeedTrends.java
new file mode 100644
index 0000000..5a2e36e
--- /dev/null
+++ b/task-3260/ExtractClientSpeedTrends.java
@@ -0,0 +1,150 @@
+import java.io.*;
+import java.text.*;
+import java.util.*;
+import org.torproject.descriptor.*;
+
+/* Extract client speed trends from download times of network status
+ * consensuses. Directory mirrors report minimum, maximum, median,
+ * quartiles, and deciles of measured client bandwidths in their
+ * extra-info descriptors. Combine statistics from the top 50 percent of
+ * directory mirrors by finished downloads, under the assumption that
+ * these directory mirrors had enough available bandwidth to serve even
+ * fast clients. Calculate statistics on a given day by reconstructing
+ * original client bandwidths from reported percentiles by assuming a
+ * linear increase between two reported percentiles and calculating
+ * percentiles of all client bandwidth values on that day. */
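+/* Illustrative example of the reconstruction step (numbers made up): if
+ * a directory mirror reports complete=20, a value of 100 at the 0th
+ * percentile (min) and 300 at the 10th percentile (d1), then
+ * 20 * (10 - 0) / 100 = 2 values are reconstructed between them in steps
+ * of (300 - 100) / 2 = 100, namely 200 and 300. Repeating this for every
+ * adjacent pair of reported percentiles yields the per-day sample from
+ * which the output percentiles are computed. */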
+public class ExtractClientSpeedTrends {
+ public static void main(String[] args) throws Exception {
+
+ System.out.println(new Date() + " Starting.");
+
+ /* Read dirreq-v3-tunneled-dl lines from in/extra-infos/ and append
+ * them to files status/YYYY-MM-DD, prefixing lines with the relay
+ * fingerprint to detect duplicates. */
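+ /* Each appended line has the form
+ * <fingerprint>,<dirreq-stats-end>,<complete>,<min>,<d1>,<d2>,<q1>,
+ * <d3>,<d4>,<md>,<d6>,<d7>,<q3>,<d8>,<d9>,<max>. */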
+ System.out.println(new Date() + " Reading in/extra-infos/* ...");
+ SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd");
+ dateFormatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+ SimpleDateFormat dateTimeFormatter = new SimpleDateFormat(
+ "yyyy-MM-dd HH:mm:ss");
+ dateTimeFormatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+ DescriptorReader extraInfoReader = DescriptorSourceFactory
+ .createDescriptorReader();
+ extraInfoReader.addDirectory(new File("in/extra-infos"));
+ extraInfoReader.setExcludeFiles(new File(
+ "status/extra-info-history"));
+ Iterator<DescriptorFile> extraInfoFiles =
+ extraInfoReader.readDescriptors();
+ List<String> columns = Arrays.asList(
+ "complete,min,d1,d2,q1,d3,d4,md,d6,d7,q3,d8,d9,max".split(","));
+ while (extraInfoFiles.hasNext()) {
+ DescriptorFile extraInfoFile = extraInfoFiles.next();
+ if (extraInfoFile.getDescriptors() == null) {
+ continue;
+ }
+ for (Descriptor descriptor : extraInfoFile.getDescriptors()) {
+ ExtraInfoDescriptor extraInfoDescriptor =
+ (ExtraInfoDescriptor) descriptor;
+ SortedMap<String, Integer> dirreqV3TunneledDl =
+ extraInfoDescriptor.getDirreqV3TunneledDl();
+ if (dirreqV3TunneledDl == null ||
+ !dirreqV3TunneledDl.keySet().containsAll(columns)) {
+ continue;
+ }
+ long dirreqStatsEndMillis = extraInfoDescriptor.
+ getDirreqStatsEndMillis();
+ String dirreqStatsEndDate = dateFormatter.format(
+ dirreqStatsEndMillis);
+ String dirreqStatsEndDateTime = dateTimeFormatter.format(
+ dirreqStatsEndMillis);
+ String fingerprint = extraInfoDescriptor.getFingerprint();
+ File statusFile = new File("status/" + dirreqStatsEndDate);
+ statusFile.getParentFile().mkdirs();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(statusFile,
+ true));
+ bw.write(fingerprint + "," + dirreqStatsEndDateTime);
+ for (String column : columns) {
+ bw.write("," + String.valueOf(dirreqV3TunneledDl.get(column)));
+ }
+ bw.write("\n");
+ bw.close();
+ }
+ }
+
+ /* Iterate over files status/YYYY-MM-DD, aggregate client speed
+ * statistics, and write them to out/client-speed-trends.csv. */
+ System.out.println(new Date() + " Writing "
+ + "out/client-speed-trends.csv ...");
+ File clientSpeedTrendsFile = new File("out/client-speed-trends.csv");
+ if (!clientSpeedTrendsFile.exists()) {
+ clientSpeedTrendsFile.getParentFile().mkdirs();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(
+ clientSpeedTrendsFile));
+ bw.write("date,lines,complete,p0,p10,p20,p25,p30,p40,p50,p60,p70,"
+ + "p75,p80,p90,p100\n");
+ for (File statusFile : new File("status").listFiles()) {
+ if (!statusFile.getName().startsWith("20")) {
+ continue;
+ }
+ BufferedReader br = new BufferedReader(new FileReader(
+ statusFile));
+ String line;
+ Set<String> statsLines = new HashSet<String>();
+ List<Integer> completes = new ArrayList<Integer>();
+ while ((line = br.readLine()) != null) {
+ int complete = Integer.parseInt(line.split(",")[2]);
+ completes.add(complete);
+ statsLines.add(line);
+ }
+ br.close();
+ Collections.sort(completes);
+ int completeP50 = completes.get((completes.size() - 1) / 2);
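+ /* Keep only status lines with at least the median number of
+ * completed downloads, i.e., the top 50 percent of directory
+ * mirrors described above. */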
+ List<Integer> downloadTimes = new ArrayList<Integer>();
+ List<Integer> percentiles = Arrays.asList(new Integer[] { 0, 10,
+ 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 100 });
+ int containedStatsLines = 0;
+ for (String statsLine : statsLines) {
+ String[] parts = statsLine.split(",");
+ int complete = Integer.parseInt(parts[2]);
+ if (complete < completeP50) {
+ continue;
+ }
+ containedStatsLines++;
+ int previousDownloadTime = Integer.parseInt(parts[3]);
+ downloadTimes.add(previousDownloadTime);
+ int previousPercentile = 0;
+ for (int i = 4; i < parts.length; i++) {
+ int currentDownloadTime = Integer.parseInt(parts[i]);
+ int currentPercentile = percentiles.get(i - 3);
+ int values = complete * (currentPercentile
+ - previousPercentile) / 100;
+ if (values > 0) {
+ int increment = (currentDownloadTime - previousDownloadTime)
+ / values;
+ int downloadTime = previousDownloadTime;
+ for (int j = 0; j < values; j++) {
+ downloadTime += increment;
+ downloadTimes.add(downloadTime);
+ }
+ }
+ previousDownloadTime = currentDownloadTime;
+ previousPercentile = currentPercentile;
+ }
+ }
+ bw.write(statusFile.getName() + "," + containedStatsLines + ","
+ + downloadTimes.size());
+ Collections.sort(downloadTimes);
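+ /* Look up each requested percentile by index in the sorted list,
+ * e.g., p50 is the element at index (50 * (n - 1)) / 100 for n
+ * reconstructed values. */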
+ for (int percentile : percentiles) {
+ int downloadTime = downloadTimes.get(((percentile
+ * (downloadTimes.size() - 1)) / 100));
+ bw.write("," + downloadTime);
+ }
+ bw.write("\n");
+ }
+ bw.close();
+ }
+
+ System.out.println(new Date() + " Terminating.");
+ }
+}
+
diff --git a/task-3260/README b/task-3260/README
new file mode 100644
index 0000000..0843c71
--- /dev/null
+++ b/task-3260/README
@@ -0,0 +1,24 @@
+Extract client speed trends from download times of network status
+consensuses.
+==========================================================================
+
+Clone the metrics-lib repository, create the descriptor.jar file, and put
+it in this directory.
+
+Obtain the Apache Commons Codec 1.4 .jar file commons-codec-1.4.jar and
+put it in this directory.
+
+Download metrics tarballs containing extra-info descriptors, extract
+them, and put the extracted contents in in/extra-infos/ in this directory.
+
+Compile and run the Java class:
+
+ $ ./run.sh
+
+In order to re-run parts of the analysis, delete files in status/ or the
+results file in out/.
+
+Draw graphs using R and ggplot2:
+
+ $ R --slave -f client-speed-trends.R
+
diff --git a/task-3260/client-speed-trends.R b/task-3260/client-speed-trends.R
new file mode 100644
index 0000000..8f389e0
--- /dev/null
+++ b/task-3260/client-speed-trends.R
@@ -0,0 +1,30 @@
+library(ggplot2)
+library(reshape2)
+c <- read.csv("out/client-speed-trends.csv", stringsAsFactors = FALSE)
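+# Exclude data from the two most recent days.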
+c <- c[as.Date(c$date) < max(as.Date(c$date)) - 1, ]
+
+ggplot(c, aes(x = as.Date(date), y = lines * 2)) +
+geom_line() +
+scale_y_continuous(name = "Directory mirrors reporting statistics\n",
+ limits = c(0, max(c$lines) * 2))
+
+ggplot(c, aes(x = as.Date(date), y = complete)) +
+geom_line()
+
+d <- c
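+# Blank out days with fewer than five contributing directory mirrors
+# before plotting the percentile lines.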
+d[d$lines < 5, c("lines", "p10", "p20", "p30", "p40", "p50")] <- NA
+d <- d[, c("date", "lines", "p10", "p20", "p30", "p40", "p50")]
+d <- melt(d, id.vars = c("date", "lines"))
+ggplot(d, aes(x = as.Date(date), y = value / 1024, colour = variable,
+ alpha = log(lines))) +
+geom_line() +
+scale_x_date(name = "", limits = as.Date(c(min(d[!is.na(d$value), "date"]),
+ max(d[!is.na(d$value), "date"]))), format = "%Y") +
+scale_y_continuous(name = "", limits = c(0, 200)) +
+ #limits = c(0, max(d$value, na.rm = TRUE) / 1024)) +
+scale_colour_hue(name = "Percentile",
+ breaks = c("p50", "p40", "p30", "p20", "p10"),
+ labels = seq(50, 10, -10), h.start = 60) +
+scale_alpha(legend = FALSE) +
+opts(title = "Estimated client bandwidth in KiB/s\n")
+ggsave("client-speed-trends.png", width = 8, height = 5, dpi = 72)
+
diff --git a/task-3260/run.sh b/task-3260/run.sh
new file mode 100755
index 0000000..63a2c98
--- /dev/null
+++ b/task-3260/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+javac -cp descriptor.jar:commons-codec-1.4.jar ExtractClientSpeedTrends.java && java -Xmx4000m -cp descriptor.jar:commons-codec-1.4.jar:. ExtractClientSpeedTrends
+