[tor-commits] [metrics-web/master] Add webstats module and webstats-tb graph.
commit 1c0ec1e13a507baa9621156645d2dc28d85c8748
Author: Karsten Loesing <karsten.loesing@xxxxxxx>
Date: Mon Jan 23 20:09:38 2017 +0100
Add webstats module and webstats-tb graph.
Implements #21236.
---
modules/webstats/.gitignore | 3 +
modules/webstats/build.xml | 27 ++
.../java/org/torproject/metrics/webstats/Main.java | 406 +++++++++++++++++++++
.../webstats/src/main/resources/init-webstats.sql | 164 +++++++++
modules/webstats/src/main/resources/write-RData.R | 9 +
.../org/torproject/metrics/webstats/MainTest.java | 93 +++++
shared/bin/90-run-webstats-stats.sh | 10 +
shared/bin/99-copy-stats-files.sh | 2 +
shared/build-base.xml | 42 +++
shared/build.xml | 2 +-
website/etc/categories.json | 5 +-
website/etc/metrics.json | 14 +
website/etc/web.xml | 4 +
website/rserve/graphs.R | 31 ++
.../metrics/web/research/ResearchStatsServlet.java | 1 +
website/web/WEB-INF/sources.jsp | 6 +
website/web/WEB-INF/stats.jsp | 34 +-
17 files changed, 849 insertions(+), 4 deletions(-)
diff --git a/modules/webstats/.gitignore b/modules/webstats/.gitignore
new file mode 100644
index 0000000..a8e4d02
--- /dev/null
+++ b/modules/webstats/.gitignore
@@ -0,0 +1,3 @@
+/stats/*.csv
+/RData/*.RData
+
diff --git a/modules/webstats/build.xml b/modules/webstats/build.xml
new file mode 100644
index 0000000..bcfe251
--- /dev/null
+++ b/modules/webstats/build.xml
@@ -0,0 +1,27 @@
+<project default="run" name="webstats" basedir=".">
+
+ <property name="sources" value="src/main/java"/>
+ <property name="testsources" value="src/test/java"/>
+
+ <include file="../../shared/build-base.xml" as="basetask"/>
+ <target name="clean" depends="basetask.clean"/>
+ <target name="compile" depends="basetask.compile"/>
+ <target name="test" depends="basetask.test"/>
+
+ <path id="classpath">
+ <pathelement path="${classes}"/>
+ <path refid="base.classpath" />
+ <fileset dir="${libs}">
+ <include name="postgresql-jdbc3-9.2.jar"/>
+ </fileset>
+ </path>
+
+ <target name="run" depends="compile">
+ <java fork="true"
+ maxmemory="1g"
+ classname="org.torproject.metrics.webstats.Main">
+ <classpath refid="classpath"/>
+ </java>
+ </target>
+</project>
+
diff --git a/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java
new file mode 100644
index 0000000..b6e2f96
--- /dev/null
+++ b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java
@@ -0,0 +1,406 @@
+/* Copyright 2016--2017 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.webstats;
+
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.Date;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TimeZone;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Main class of the webstats module that downloads log files from the server,
+ * imports them into a database, and exports aggregate statistics to a CSV
+ * file. */
+public class Main {
+
+ /** Logger for this class. */
+ private static Logger log = LoggerFactory.getLogger(Main.class);
+
+ /** Pattern for links contained in directory listings. */
+ static final Pattern URL_STRING_PATTERN =
+ Pattern.compile(".*<a href=\"([^\"]+)\">.*");
+
+ static final Pattern LOG_FILE_URL_PATTERN =
+ Pattern.compile("^.*/([^/]+)/([^/]+)-access.log-(\\d{8}).xz$");
+
+ private static DateFormat logDateFormat;
+
+ static {
+ logDateFormat = new SimpleDateFormat("yyyyMMdd");
+ logDateFormat.setLenient(false);
+ logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ }
+
+ static final Pattern LOG_LINE_PATTERN = Pattern.compile(
+ "^0.0.0.[01] - - \\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] "
+ + "\"(GET|HEAD) ([^ ]+) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" -$");
+
+ private static final String LOG_DATE = "log_date";
+
+ private static final String REQUEST_TYPE = "request_type";
+
+ private static final String PLATFORM = "platform";
+
+ private static final String CHANNEL = "channel";
+
+ private static final String LOCALE = "locale";
+
+ private static final String INCREMENTAL = "incremental";
+
+ private static final String COUNT = "count";
+
+ private static final String ALL_COLUMNS = LOG_DATE + "," + REQUEST_TYPE + ","
+ + PLATFORM + "," + CHANNEL + "," + LOCALE + "," + INCREMENTAL + ","
+ + COUNT;
+
+ /** Executes this data-processing module. */
+ public static void main(String[] args) throws Exception {
+ log.info("Starting webstats module.");
+ String dbUrlString = "jdbc:postgresql:webstats";
+ Connection connection = connectToDatabase(dbUrlString);
+ SortedSet<String> previouslyImportedLogFileUrls =
+ queryImportedFiles(connection);
+ String baseUrl = "https://webstats.torproject.org/out/";
+ SortedSet<String> newLogFileUrls = downloadDirectoryListings(baseUrl,
+ previouslyImportedLogFileUrls);
+ importLogFiles(connection, newLogFileUrls);
+ SortedSet<String> statistics = queryWebstats(connection);
+ writeStatistics(Paths.get("stats", "webstats.csv"), statistics);
+ disconnectFromDatabase(connection);
+ log.info("Terminated webstats module.");
+ }
+
+ private static Connection connectToDatabase(String jdbcString)
+ throws SQLException {
+ log.info("Connecting to database.");
+ Connection connection = DriverManager.getConnection(jdbcString);
+ connection.setAutoCommit(false);
+ log.info("Successfully connected to database.");
+ return connection;
+ }
+
+ static SortedSet<String> queryImportedFiles(Connection connection)
+ throws SQLException {
+ log.info("Querying URLs of previously imported log files.");
+ SortedSet<String> importedLogFileUrls = new TreeSet<>();
+ Statement st = connection.createStatement();
+ String queryString = "SELECT url FROM files";
+ try (ResultSet rs = st.executeQuery(queryString)) {
+ while (rs.next()) {
+ importedLogFileUrls.add(rs.getString(1));
+ }
+ }
+ log.info("Found {} URLs of previously imported log files.",
+ importedLogFileUrls.size());
+ return importedLogFileUrls;
+ }
+
+ static SortedSet<String> downloadDirectoryListings(String baseUrl,
+ SortedSet<String> importedLogFileUrls) throws IOException {
+ log.info("Downloading directory listings from {}.", baseUrl);
+ List<String> directoryListings = new ArrayList<>();
+ directoryListings.add(baseUrl);
+ SortedSet<String> newLogFileUrls = new TreeSet<>();
+ while (!directoryListings.isEmpty()) {
+ String urlString = directoryListings.remove(0);
+ if (urlString.endsWith("/")) {
+ directoryListings.addAll(downloadDirectoryListing(urlString));
+ } else if (!urlString.endsWith(".xz")) {
+ log.debug("Skipping unrecognized URL {}.", urlString);
+ } else if (!importedLogFileUrls.contains(urlString)) {
+ newLogFileUrls.add(urlString);
+ }
+ }
+ log.info("Found {} URLs of log files that have not yet been imported.",
+ newLogFileUrls.size());
+ return newLogFileUrls;
+ }
+
+ static List<String> downloadDirectoryListing(String urlString)
+ throws IOException {
+ log.debug("Downloading directory listing from {}.", urlString);
+ List<String> urlStrings = new ArrayList<>();
+ try (BufferedReader br = new BufferedReader(new InputStreamReader(
+ new URL(urlString).openStream()))) {
+ String line;
+ while ((line = br.readLine()) != null) {
+ Matcher matcher = URL_STRING_PATTERN.matcher(line);
+ if (matcher.matches() && !matcher.group(1).startsWith("/")) {
+ urlStrings.add(urlString + matcher.group(1));
+ }
+ }
+ }
+ return urlStrings;
+ }
+
+ static void importLogFiles(Connection connection,
+ SortedSet<String> newLogFileUrls) {
+ log.info("Downloading, parsing, and importing {} log files.",
+ newLogFileUrls.size());
+ for (String urlString : newLogFileUrls) {
+ try {
+ Object[] metaData = parseMetaData(urlString);
+ if (metaData == null) {
+ continue;
+ }
+ List<String> downloadedLogLines = downloadLogFile(urlString);
+ Map<String, Integer> parsedLogLines = parseLogLines(urlString,
+ downloadedLogLines);
+ importLogLines(connection, urlString, metaData, parsedLogLines);
+ } catch (IOException | ParseException exc) {
+ log.warn("Cannot download or parse log file with URL {}. Retrying "
+ + "in the next run.", urlString, exc);
+ } catch (SQLException exc) {
+ log.warn("Cannot import log file with URL {} into the database. "
+ + "Rolling back and retrying in the next run.", urlString, exc);
+ try {
+ connection.rollback();
+ } catch (SQLException exceptionWhileRollingBack) {
+ /* Ignore. */
+ }
+ }
+ }
+ }
+
+ private static Object[] parseMetaData(String urlString)
+ throws ParseException {
+ log.debug("Importing log file {}.", urlString);
+ if (urlString.contains("-ssl-access.log-")) {
+ log.debug("Skipping log file containing SSL requests with URL {}.",
+ urlString);
+ return null;
+ }
+ Matcher logFileUrlMatcher = LOG_FILE_URL_PATTERN.matcher(urlString);
+ if (!logFileUrlMatcher.matches()) {
+ log.debug("Skipping log file with unrecognized URL {}.", urlString);
+ return null;
+ }
+ String server = logFileUrlMatcher.group(1);
+ String site = logFileUrlMatcher.group(2);
+ long logDateMillis = logDateFormat.parse(logFileUrlMatcher.group(3))
+ .getTime();
+ return new Object[] { server, site, new Long(logDateMillis) };
+ }
+
+ static List<String> downloadLogFile(String urlString) throws IOException {
+ List<String> downloadedLogLines = new ArrayList<>();
+ try (BufferedReader br = new BufferedReader(new InputStreamReader(
+ new XZCompressorInputStream(new URL(urlString).openStream())))) {
+ String line;
+ while ((line = br.readLine()) != null) {
+ downloadedLogLines.add(line);
+ }
+ }
+ return downloadedLogLines;
+ }
+
+ static Map<String, Integer> parseLogLines(String urlString,
+ List<String> logLines) {
+ int skippedLines = 0;
+ Map<String, Integer> parsedLogLines = new HashMap<>();
+ for (String logLine : logLines) {
+ Matcher logLineMatcher = LOG_LINE_PATTERN.matcher(logLine);
+ if (!logLineMatcher.matches()) {
+ skippedLines++;
+ continue;
+ }
+ String method = logLineMatcher.group(1);
+ String resource = logLineMatcher.group(2);
+ int responseCode = Integer.parseInt(logLineMatcher.group(3));
+ String combined = String.format("%s %s %d", method, resource,
+ responseCode);
+ if (!parsedLogLines.containsKey(combined)) {
+ parsedLogLines.put(combined, 1);
+ } else {
+ parsedLogLines.put(combined, parsedLogLines.get(combined) + 1);
+ }
+ }
+ if (skippedLines > 0) {
+ log.debug("Skipped {} lines while parsing log file {}.", skippedLines,
+ urlString);
+ }
+ return parsedLogLines;
+ }
+
+ private static void importLogLines(Connection connection, String urlString,
+ Object[] metaData, Map<String, Integer> parsedLogLines)
+ throws SQLException {
+ PreparedStatement psFiles = connection.prepareStatement(
+ "INSERT INTO files (url, server, site, " + LOG_DATE + ") "
+ + "VALUES (?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS);
+ PreparedStatement psResourcesSelect = connection.prepareStatement(
+ "SELECT resource_id FROM resources WHERE resource_string = ?");
+ PreparedStatement psResourcesInsert = connection.prepareStatement(
+ "INSERT INTO resources (resource_string) VALUES (?)",
+ Statement.RETURN_GENERATED_KEYS);
+ PreparedStatement psRequests = connection.prepareStatement(
+ "INSERT INTO requests (file_id, method, resource_id, response_code, "
+ + COUNT + ") VALUES (?, CAST(? AS method), ?, ?, ?)");
+ String server = (String) metaData[0];
+ String site = (String) metaData[1];
+ long logDateMillis = (long) metaData[2];
+ int fileId = insertFile(psFiles, urlString, server, site, logDateMillis);
+ if (fileId < 0) {
+ log.debug("Skipping previously imported log file {}.", urlString);
+ return;
+ }
+ for (Map.Entry<String, Integer> requests : parsedLogLines.entrySet()) {
+ String[] keyParts = requests.getKey().split(" ");
+ String method = keyParts[0];
+ String resource = keyParts[1];
+ int responseCode = Integer.parseInt(keyParts[2]);
+ int count = requests.getValue();
+ int resourceId = insertResource(psResourcesSelect, psResourcesInsert,
+ resource);
+ if (resourceId < 0) {
+ log.error("Could not retrieve auto-generated key for new resources "
+ + "entry.");
+ connection.rollback();
+ return;
+ }
+ insertRequest(psRequests, fileId, method, resourceId, responseCode,
+ count);
+ }
+ connection.commit();
+ log.debug("Finished importing log file with URL {} into database.",
+ urlString);
+ }
+
+ private static int insertFile(PreparedStatement psFiles, String urlString,
+ String server, String site, long logDateMillis) throws SQLException {
+ int fileId = -1;
+ psFiles.clearParameters();
+ psFiles.setString(1, truncateString(urlString, 2048));
+ psFiles.setString(2, truncateString(server, 32));
+ psFiles.setString(3, truncateString(site, 128));
+ psFiles.setDate(4, new Date(logDateMillis));
+ psFiles.execute();
+ try (ResultSet rs = psFiles.getGeneratedKeys()) {
+ if (rs.next()) {
+ fileId = rs.getInt(1);
+ }
+ }
+ return fileId;
+ }
+
+ private static void insertRequest(PreparedStatement psRequests, int fileId,
+ String method, int resourceId, int responseCode, int count)
+ throws SQLException {
+ psRequests.clearParameters();
+ psRequests.setInt(1, fileId);
+ psRequests.setString(2, method);
+ psRequests.setInt(3, resourceId);
+ psRequests.setInt(4, responseCode);
+ psRequests.setInt(5, count);
+ psRequests.execute();
+ }
+
+ private static int insertResource(PreparedStatement psResourcesSelect,
+ PreparedStatement psResourcesInsert, String resource)
+ throws SQLException {
+ int resourceId = -1;
+ String truncatedResource = truncateString(resource, 2048);
+ psResourcesSelect.clearParameters();
+ psResourcesSelect.setString(1, truncatedResource);
+ try (ResultSet rs = psResourcesSelect.executeQuery()) {
+ if (rs.next()) {
+ resourceId = rs.getInt(1);
+ }
+ }
+ if (resourceId < 0) {
+ /* There's a small potential for a race condition between the previous
+ * SELECT and this INSERT INTO, but that will be resolved by the UNIQUE
+ * constraint when committing the transaction. */
+ psResourcesInsert.clearParameters();
+ psResourcesInsert.setString(1, truncatedResource);
+ psResourcesInsert.execute();
+ try (ResultSet rs = psResourcesInsert.getGeneratedKeys()) {
+ if (rs.next()) {
+ resourceId = rs.getInt(1);
+ }
+ }
+ }
+ return resourceId;
+ }
+
+ private static String truncateString(String originalString,
+ int truncateAfter) {
+ if (originalString.length() > truncateAfter) {
+ originalString = originalString.substring(0, truncateAfter);
+ }
+ return originalString;
+ }
+
+ static SortedSet<String> queryWebstats(Connection connection)
+ throws SQLException {
+ log.info("Querying statistics from database.");
+ SortedSet<String> statistics = new TreeSet<>();
+ Statement st = connection.createStatement();
+ String queryString = "SELECT " + ALL_COLUMNS + " FROM webstats";
+ DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ try (ResultSet rs = st.executeQuery(queryString)) {
+ while (rs.next()) {
+ statistics.add(String.format("%s,%s,%s,%s,%s,%s,%d",
+ dateFormat.format(rs.getDate(LOG_DATE)),
+ emptyNull(rs.getString(REQUEST_TYPE)),
+ emptyNull(rs.getString(PLATFORM)),
+ emptyNull(rs.getString(CHANNEL)),
+ emptyNull(rs.getString(LOCALE)),
+ emptyNull(rs.getString(INCREMENTAL)),
+ rs.getLong(COUNT)));
+ }
+ }
+ return statistics;
+ }
+
+ private static String emptyNull(String text) {
+ return null == text ? "" : text;
+ }
+
+ static void writeStatistics(Path webstatsPath,
+ SortedSet<String> statistics) throws IOException {
+ webstatsPath.toFile().getParentFile().mkdirs();
+ List<String> lines = new ArrayList<String>();
+ lines.add(ALL_COLUMNS);
+ lines.addAll(statistics);
+ log.info("Writing {} lines to {}.", lines.size(),
+ webstatsPath.toFile().getAbsolutePath());
+ Files.write(webstatsPath, lines, StandardCharsets.UTF_8);
+ }
+
+ private static void disconnectFromDatabase(Connection connection)
+ throws SQLException {
+ log.info("Disconnecting from database.");
+ connection.close();
+ }
+}
+
diff --git a/modules/webstats/src/main/resources/init-webstats.sql b/modules/webstats/src/main/resources/init-webstats.sql
new file mode 100644
index 0000000..98bb758
--- /dev/null
+++ b/modules/webstats/src/main/resources/init-webstats.sql
@@ -0,0 +1,164 @@
+-- Copyright 2016--2017 The Tor Project
+-- See LICENSE for licensing information
+
+CREATE TYPE method AS ENUM ('GET', 'HEAD');
+
+CREATE TABLE files (
+ file_id SERIAL PRIMARY KEY,
+ url CHARACTER VARYING(2048) UNIQUE NOT NULL,
+ server CHARACTER VARYING(32) NOT NULL,
+ site CHARACTER VARYING(128) NOT NULL,
+ log_date DATE NOT NULL,
+ UNIQUE (server, site, log_date)
+);
+
+CREATE TABLE resources (
+ resource_id SERIAL PRIMARY KEY,
+ resource_string CHARACTER VARYING(2048) UNIQUE NOT NULL
+);
+
+CREATE TABLE requests (
+ file_id INTEGER REFERENCES files (file_id) NOT NULL,
+ method METHOD NOT NULL,
+ resource_id INTEGER REFERENCES resources (resource_id) NOT NULL,
+ response_code SMALLINT NOT NULL,
+ count INTEGER NOT NULL,
+ UNIQUE (file_id, method, resource_id, response_code)
+);
+
+CREATE OR REPLACE VIEW webstats AS
+ SELECT log_date,
+ CASE WHEN resource_string LIKE '%.asc' THEN 'tbsd'
+ ELSE 'tbid' END AS request_type,
+ CASE WHEN resource_string LIKE '%.exe%' THEN 'w'
+ WHEN resource_string LIKE '%.dmg%' THEN 'm'
+ WHEN resource_string LIKE '%.tar.xz%' THEN 'l'
+ ELSE 'o' END AS platform,
+ CASE WHEN resource_string LIKE '%-hardened%' THEN 'h'
+ WHEN resource_string LIKE '%/%.%a%/%' THEN 'a'
+ ELSE 'r' END AS channel,
+ COALESCE(SUBSTRING(resource_string
+ FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[\._-].*'), '??') AS locale,
+ NULL::BOOLEAN AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE (resource_string LIKE '%/torbrowser/%.exe'
+ OR resource_string LIKE '%/torbrowser/%.dmg'
+ OR resource_string LIKE '%/torbrowser/%.tar.xz'
+ OR resource_string LIKE '%/torbrowser/%.exe.asc'
+ OR resource_string LIKE '%/torbrowser/%.dmg.asc'
+ OR resource_string LIKE '%/torbrowser/%.tar.xz.asc')
+ AND response_code = 200
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental
+ UNION
+ SELECT log_date,
+ 'tbup' AS request_type,
+ CASE WHEN resource_string LIKE '%/WINNT%' THEN 'w'
+ WHEN resource_string LIKE '%/Darwin%' THEN 'm'
+ ELSE 'l' END AS platform,
+ CASE WHEN resource_string LIKE '%/hardened/%' THEN 'h'
+ WHEN resource_string LIKE '%/alpha/%' THEN 'a'
+ WHEN resource_string LIKE '%/release/%' THEN 'r'
+ ELSE 'o' END AS channel,
+ COALESCE(SUBSTRING(resource_string
+ FROM '.*/([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})\??$'), '??') AS locale,
+ NULL::BOOLEAN AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE resource_string LIKE '%/torbrowser/update_2/%'
+ AND resource_string NOT LIKE '%.xml'
+ AND response_code = 200
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental
+ UNION
+ SELECT log_date,
+ 'tbur' AS request_type,
+ CASE WHEN resource_string LIKE '%-win32-%' THEN 'w'
+ WHEN resource_string LIKE '%-osx%' THEN 'm'
+ ELSE 'l' END AS platform,
+ CASE WHEN resource_string LIKE '%-hardened%' THEN 'h'
+ WHEN resource_string LIKE '%/%.%a%/%' THEN 'a'
+ ELSE 'r' END AS channel,
+ COALESCE(SUBSTRING(resource_string
+ FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[\._-].*'), '??') AS locale,
+ CASE WHEN resource_string LIKE '%.incremental.%' THEN TRUE
+ ELSE FALSE END AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE resource_string LIKE '%/torbrowser/%.mar'
+ AND response_code = 302
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental
+ UNION
+ SELECT log_date,
+ 'tmid' AS request_type,
+ CASE WHEN resource_string LIKE '%.exe' THEN 'w'
+ WHEN resource_string LIKE '%.dmg' THEN 'm'
+ WHEN resource_string LIKE '%.tar.xz' THEN 'l'
+ ELSE 'o' END AS platform,
+ NULL AS channel,
+ COALESCE(SUBSTRING(resource_string
+ FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[\._-].*'), '??') AS locale,
+ NULL::BOOLEAN AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE (resource_string LIKE '%/tormessenger/%.exe'
+ OR resource_string LIKE '%/tormessenger/%.dmg'
+ OR resource_string LIKE '%/tormessenger/%.tar.xz')
+ AND response_code = 200
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental
+ UNION
+ SELECT log_date,
+ 'tmup' AS request_type,
+ CASE WHEN resource_string LIKE '%/WINNT%' THEN 'w'
+ WHEN resource_string LIKE '%/Darwin%' THEN 'm'
+ WHEN resource_string LIKE '%/Linux%' THEN 'l'
+ ELSE 'o' END AS platform,
+ NULL AS channel,
+ COALESCE(SUBSTRING(resource_string
+ FROM '.*/([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})\??$'), '??') AS locale,
+ NULL::BOOLEAN AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE resource_string LIKE '%/tormessenger/update_2/%'
+ AND resource_string NOT LIKE '%.xml'
+ AND resource_string NOT LIKE '%/'
+ AND resource_string NOT LIKE '%/?'
+ AND response_code = 200
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental
+ UNION
+ SELECT log_date,
+ 'twhph' AS request_type,
+ NULL AS platform,
+ NULL AS channel,
+ NULL AS locale,
+ NULL::BOOLEAN AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE (resource_string = '/'
+ OR resource_string LIKE '/index%')
+ AND response_code = 200
+ AND (site = 'torproject.org'
+ OR site = 'www.torproject.org')
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental
+ UNION
+ SELECT log_date,
+ 'twdph' AS request_type,
+ NULL AS platform,
+ NULL AS channel,
+ NULL AS locale,
+ NULL::BOOLEAN AS incremental,
+ SUM(count) AS count
+ FROM files NATURAL JOIN requests NATURAL JOIN resources
+ WHERE (resource_string LIKE '/download/download%'
+ OR resource_string LIKE '/projects/torbrowser.html%')
+ AND response_code = 200
+ AND (site = 'torproject.org'
+ OR site = 'www.torproject.org')
+ AND method = 'GET'
+ GROUP BY log_date, request_type, platform, channel, locale, incremental;
+
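
For readers exploring the webstats view defined above, a minimal query against it, for example daily Tor Browser initial downloads per platform, might look like the following sketch (the cutoff date is purely illustrative):

    -- Sum up daily Tor Browser initial downloads ('tbid') per platform.
    -- The date literal below is a hypothetical example.
    SELECT log_date, platform, SUM(count) AS downloads
      FROM webstats
     WHERE request_type = 'tbid'
       AND log_date >= DATE '2017-01-01'
     GROUP BY log_date, platform
     ORDER BY log_date, platform;

The module itself simply selects all columns from this view (see queryWebstats() in Main.java above) and writes the result to stats/webstats.csv.
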
diff --git a/modules/webstats/src/main/resources/write-RData.R b/modules/webstats/src/main/resources/write-RData.R
new file mode 100644
index 0000000..2cb8917
--- /dev/null
+++ b/modules/webstats/src/main/resources/write-RData.R
@@ -0,0 +1,9 @@
+dir.create("RData", showWarnings = FALSE)
+
+d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE)
+d <- d[d$request_type %in% c('tbid', 'tbsd', 'tbup', 'tbur'), ]
+data <- aggregate(list(count = d$count),
+ by = list(log_date = as.Date(d$log_date), request_type = d$request_type),
+ FUN = sum)
+save(data, file = "RData/webstats-tb.RData")
+
diff --git a/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java
new file mode 100644
index 0000000..1c4f0bc
--- /dev/null
+++ b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java
@@ -0,0 +1,93 @@
+/* Copyright 2017 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.webstats;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+import java.util.regex.Matcher;
+
+public class MainTest {
+
+ static final String SAMPLE_LOG_FILE_NAME =
+ "metrics.torproject.org-access.log-20170117.xz";
+
+ static final String SAMPLE_SUBDIRECTORY_NAME = "meronense.torproject.org/";
+
+ static final String SAMPLE_LOG_FILE_URL =
+ "https://webstats.torproject.org/out/meronense.torproject.org/"
+ + "metrics.torproject.org-access.log-20170117.xz";
+
+ static final String[] SAMPLE_LOG_LINES = new String[] {
+ "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
+ + "\"GET / HTTP/1.0\" 200 10532 \"-\" \"-\" -",
+ "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
+ + "\"HEAD /bubbles.html HTTP/1.1\" 200 - \"-\" \"-\" -"
+ };
+
+ @Test
+ public void testUrlStringPatternComplete() {
+ Matcher matcher = Main.URL_STRING_PATTERN.matcher(
+ "<img src=\"/icons/unknown.gif\" alt=\"[ ]\"> "
+ + "<a href=\"" + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME
+ + "</a> 2017-01-19 19:43 5.6K ");
+ assertTrue(matcher.matches());
+ assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1));
+ }
+
+ @Test
+ public void testUrlStringPatternOnlyATag() {
+ Matcher matcher = Main.URL_STRING_PATTERN.matcher("<a href=\""
+ + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME + "</a>");
+ assertTrue(matcher.matches());
+ assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1));
+ }
+
+ @Test
+ public void testUrlStringPatternSubdirectory() {
+ Matcher matcher = Main.URL_STRING_PATTERN.matcher(
+ "<a href=\"" + SAMPLE_SUBDIRECTORY_NAME + "\">"
+ + SAMPLE_SUBDIRECTORY_NAME + "/</a>");
+ assertTrue(matcher.matches());
+ assertEquals(SAMPLE_SUBDIRECTORY_NAME, matcher.group(1));
+ }
+
+ @Test
+ public void testUrlStringPatternAnythingBetweenDoubleQuotesHtml() {
+ Matcher matcher = Main.URL_STRING_PATTERN.matcher(
+ "<a href=\"anything-between-double-quotes.html\">Link/</a>");
+ assertTrue(matcher.matches());
+ assertEquals("anything-between-double-quotes.html", matcher.group(1));
+ }
+
+ @Test
+ public void testLogFileUrlPatternComplete() {
+ Matcher matcher = Main.LOG_FILE_URL_PATTERN.matcher(SAMPLE_LOG_FILE_URL);
+ assertTrue(matcher.matches());
+ assertEquals("meronense.torproject.org", matcher.group(1));
+ assertEquals("metrics.torproject.org", matcher.group(2));
+ assertEquals("20170117", matcher.group(3));
+ }
+
+ @Test
+ public void testLogLinePatternGetSlash() {
+ Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[0]);
+ assertTrue(matcher.matches());
+ assertEquals("GET", matcher.group(1));
+ assertEquals("/", matcher.group(2));
+ assertEquals("200", matcher.group(3));
+ }
+
+ @Test
+ public void testLogLinePatternHeadBubbles() {
+ Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[1]);
+ assertTrue(matcher.matches());
+ assertEquals("HEAD", matcher.group(1));
+ assertEquals("/bubbles.html", matcher.group(2));
+ assertEquals("200", matcher.group(3));
+ }
+}
+
diff --git a/shared/bin/90-run-webstats-stats.sh b/shared/bin/90-run-webstats-stats.sh
new file mode 100755
index 0000000..37091b4
--- /dev/null
+++ b/shared/bin/90-run-webstats-stats.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+cd modules/webstats/
+
+ant run | grep "\[java\]"
+
+R --slave -f src/main/resources/write-RData.R > /dev/null 2>&1
+
+cd ../../
+
diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh
index d236630..a828686 100755
--- a/shared/bin/99-copy-stats-files.sh
+++ b/shared/bin/99-copy-stats-files.sh
@@ -6,7 +6,9 @@ cp -a modules/advbwdist/stats/advbwdist.csv shared/stats/
cp -a modules/hidserv/stats/hidserv.csv shared/stats/
cp -a modules/clients/stats/clients*.csv shared/stats/
cp -a modules/clients/stats/userstats-combined.csv shared/stats/
+cp -a modules/webstats/stats/webstats.csv shared/stats/
mkdir -p shared/RData
cp -a modules/clients/RData/*.RData shared/RData/
+cp -a modules/webstats/RData/*.RData shared/RData/
diff --git a/shared/build-base.xml b/shared/build-base.xml
index 7051f49..759e1d0 100644
--- a/shared/build-base.xml
+++ b/shared/build-base.xml
@@ -1,9 +1,11 @@
<project basedir=".">
<property name="sources" value="src"/>
+ <property name="testsources" value="src/test/java"/>
<property name="libs" value="../../shared/lib"/>
<property name="generated" value="generated"/>
<property name="classes" value="${generated}/classes/"/>
+ <property name="testclasses" value="${generated}/test-classes/"/>
<property name="source-and-target-java-version" value="1.7" />
<property name="descriptorversion" value="1.5.0" />
@@ -21,6 +23,21 @@
</fileset>
</path>
+ <path id="base.testclasspath">
+ <pathelement path="${base.testclasses}"/>
+ <pathelement path="base.classpath"/>
+ <fileset dir="${libs}">
+ <include name="hamcrest-core-1.3.jar"/>
+ <include name="junit4-4.11.jar"/>
+ </fileset>
+ </path>
+
+ <path id="testclasspath">
+ <pathelement path="${testclasses}"/>
+ <path refid="base.testclasspath" />
+ <path refid="base.classpath" />
+ </path>
+
<target name="clean">
<delete includeEmptyDirs="true" quiet="true">
<fileset dir="${generated}" defaultexcludes="false" includes="**" />
@@ -29,6 +46,7 @@
<target name="init">
<mkdir dir="${classes}"/>
+ <mkdir dir="${testclasses}"/>
</target>
<target name="compile" depends="init" >
@@ -45,6 +63,30 @@
</javac>
</target>
+ <target name="testcompile" depends="compile" >
+ <javac destdir="${testclasses}"
+ srcdir="${testsources}"
+ source="${source-and-target-java-version}"
+ target="${source-and-target-java-version}"
+ debug="true" debuglevel="lines,source"
+ deprecation="true"
+ optimize="false"
+ failonerror="true"
+ includeantruntime="false">
+ <classpath refid="testclasspath"/>
+ </javac>
+ </target>
+
+ <target name="test" depends="testcompile">
+ <junit fork="true" haltonfailure="true" printsummary="off">
+ <classpath refid="testclasspath"/>
+ <formatter type="plain" usefile="false"/>
+ <batchtest>
+ <fileset dir="${testclasses}"
+ includes="**/*Test.class"/>
+ </batchtest>
+ </junit>
+ </target>
</project>
diff --git a/shared/build.xml b/shared/build.xml
index 13a09f7..cb51d5f 100644
--- a/shared/build.xml
+++ b/shared/build.xml
@@ -26,9 +26,9 @@
<fileset dir="../modules/clients/src" includes="**/*.java"/>
<fileset dir="../modules/collectdescs/src" includes="**/*.java"/>
<fileset dir="../modules/connbidirect/src" includes="**/*.java"/>
- <fileset dir="../modules/disagreement/src" includes="**/*.java"/>
<fileset dir="../modules/hidserv/src" includes="**/*.java"/>
<fileset dir="../modules/legacy/src" includes="**/*.java"/>
+ <fileset dir="../modules/webstats/src" includes="**/*.java"/>
<classpath>
<path refid="checkstyle.classpath" />
</classpath>
diff --git a/website/etc/categories.json b/website/etc/categories.json
index 8b4ea77..6825634 100644
--- a/website/etc/categories.json
+++ b/website/etc/categories.json
@@ -78,6 +78,9 @@
"icon": "fa-download",
"header": "Applications",
"summary": "How many Tor applications, like Tor Browser, have been downloaded or updated.",
- "metrics": []
+ "description": "The following application statistics are based on the analysis of requests to <code>torproject.org</code> web servers.",
+ "metrics": [
+ "webstats-tb"
+ ]
}
]
diff --git a/website/etc/metrics.json b/website/etc/metrics.json
index f7666be..4a97ca1 100644
--- a/website/etc/metrics.json
+++ b/website/etc/metrics.json
@@ -402,5 +402,19 @@
"title": "Network churn rate by relay flag",
"type": "Link",
"description": "<p>This image shows the churn rate of the Tor network by <a href=\"glossary.html#relay-flag\">relay flag</a> in a given month. The churn rate, a value in the interval <b>[0,1]</b>, captures the rate of <a href=\"glossary.html#relay\">relays</a> joining and leaving the network from one <a href=\"glossary.html#consensus\">consensus</a> to the next (that is, within one hour). The complete image gallery can be found on <a href=\"https://nymity.ch/sybilhunting/churn-values/\">Philipp Winter's homepage</a>.</p><p><a href=\"https://nymity.ch/sybilhunting/churn-values/\"><img src=\"images/networkchurn.png\" alt=\"Network churn rate by relay flag\"></a></p>"
+ },
+ {
+ "id": "webstats-tb",
+ "title": "Tor Browser downloads and updates",
+ "type": "Graph",
+ "description": "<p>This graph shows absolute numbers of requests to Tor's web servers by request type. It is based on data from <a href=\"https://webstats.torproject.org/\" target=\"_blank\"><code>webstats.torproject.org</code></a> which collects logs from <code>torproject.org</code> web servers and provides them as a stripped-down version of Apache's "combined" log format without IP addresses, log times, HTTP parameters, referers, and user agent strings. <em>Initial downloads</em> and <em>signature downloads</em> are requests made by the user to download a Tor Browser executable or a corresponding signature file from the Tor website. <em>Update pings</em> and <em>update requests</em> are requests made by Tor Browser to check whether a newer version is available or to download a newer version.</p>",
+ "function": "plot_webstats_tb",
+ "parameters": [
+ "start",
+ "end"
+ ],
+ "data": [
+ "webstats"
+ ]
}
]
diff --git a/website/etc/web.xml b/website/etc/web.xml
index 7444cf5..916984c 100644
--- a/website/etc/web.xml
+++ b/website/etc/web.xml
@@ -46,6 +46,7 @@
<url-pattern>/hidserv-dir-onions-seen.html</url-pattern>
<url-pattern>/hidserv-rend-relayed-cells.html</url-pattern>
<url-pattern>/hidserv-frac-reporting.html</url-pattern>
+ <url-pattern>/webstats-tb.html</url-pattern>
</servlet-mapping>
<servlet>
@@ -177,6 +178,9 @@
<url-pattern>/hidserv-frac-reporting.png</url-pattern>
<url-pattern>/hidserv-frac-reporting.pdf</url-pattern>
<url-pattern>/hidserv-frac-reporting.svg</url-pattern>
+ <url-pattern>/webstats-tb.png</url-pattern>
+ <url-pattern>/webstats-tb.pdf</url-pattern>
+ <url-pattern>/webstats-tb.svg</url-pattern>
</servlet-mapping>
<servlet>
diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R
index 9f8daa7..fc6cfb6 100644
--- a/website/rserve/graphs.R
+++ b/website/rserve/graphs.R
@@ -1095,3 +1095,34 @@ plot_hidserv_frac_reporting <- function(start, end, path) {
ggsave(filename = path, width = 8, height = 5, dpi = 72)
}
+plot_webstats_tb <- function(start, end, path) {
+ end <- min(end, as.character(Sys.Date() - 2))
+ load("/srv/metrics.torproject.org/metrics/shared/RData/webstats-tb.RData")
+ d <- data
+ d <- d[d$log_date >= start & d$log_date <= end, ]
+ date_breaks <- date_breaks(as.numeric(max(d$log_date) - min(d$log_date)))
+ d$request_type <- factor(d$request_type)
+ levels(d$request_type) <- list(
+ 'Initial downloads' = 'tbid',
+ 'Signature downloads' = 'tbsd',
+ 'Update pings' = 'tbup',
+ 'Update requests' = 'tbur')
+ formatter <- function(x, ...) {
+ format(x, ..., scientific = FALSE, big.mark = ' ') }
+ ggplot(d, aes(x = log_date, y = count)) +
+ geom_point() +
+ geom_line() +
+ expand_limits(y = 0) +
+ facet_grid(request_type ~ ., scales = "free_y") +
+ scale_x_date(name = paste("\nThe Tor Project - ",
+ "https://metrics.torproject.org/", sep = ""),
+ labels = date_format(date_breaks$format),
+ breaks = date_breaks$major,
+ minor_breaks = date_breaks$minor) +
+ scale_y_continuous(name = 'Requests per day\n', labels = formatter) +
+ theme(strip.text.y = element_text(angle = 0, hjust = 0, size = rel(1.5)),
+ strip.background = element_rect(fill = NA)) +
+ ggtitle("Tor Browser downloads and updates\n")
+ ggsave(filename = path, width = 8, height = 5, dpi = 72)
+}
+
diff --git a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
index 8f5c399..de5715e 100644
--- a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
+++ b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
@@ -39,6 +39,7 @@ public class ResearchStatsServlet extends HttpServlet {
this.availableStatisticsFiles.add("advbwdist");
this.availableStatisticsFiles.add("hidserv");
this.availableStatisticsFiles.add("disagreement");
+ this.availableStatisticsFiles.add("webstats");
}
@Override
diff --git a/website/web/WEB-INF/sources.jsp b/website/web/WEB-INF/sources.jsp
index 64ca85b..b36d43f 100644
--- a/website/web/WEB-INF/sources.jsp
+++ b/website/web/WEB-INF/sources.jsp
@@ -39,6 +39,12 @@
</div>
<div class="container">
+ <ul>
+ <li><a href="https://webstats.torproject.org/" target="_blank"><code>webstats.torproject.org</code></a> collects logs from <code>torproject.org</code> web servers and provides them as a stripped-down version of Apache's "combined" log format without IP addresses, log times, HTTP parameters, referers, and user agent strings.</li>
+ </ul>
+ </div>
+
+ <div class="container">
<h2>Measurement tools <a href="#measurement" name="measurement" class="anchor">#</a></h2>
<p>The following tools perform active measurements in the Tor network.</p>
<ul>
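
As an aside, the "stripped-down" log format mentioned in the sources.jsp entry above is illustrated by the sample lines in MainTest.java: client addresses are zeroed out (0.0.0.0 or 0.0.0.1), log times are set to 00:00:00 +0000, and referer and user-agent fields are replaced by "-", for example:

    0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "GET / HTTP/1.0" 200 10532 "-" "-" -
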
diff --git a/website/web/WEB-INF/stats.jsp b/website/web/WEB-INF/stats.jsp
index fc676ba..fbecc0f 100644
--- a/website/web/WEB-INF/stats.jsp
+++ b/website/web/WEB-INF/stats.jsp
@@ -483,8 +483,38 @@ given attribute.</li>
</ul>
- </div>
- </div>
+</div>
+
+<div class="container">
+<h2>Requests to <code>torproject.org</code> web servers <a href="#webstats" name="webstats" class="anchor">#</a></h2>
+
+<p>The following data file contains aggregate statistics on requests to <code>torproject.org</code> web servers.</p>
+
+<p><b>Download as <a href="stats/webstats.csv">CSV file</a>.</b></p>
+
+<p>The statistics file contains the following columns:</p>
+<ul>
+<li><b>log_date:</b> UTC date (YYYY-MM-DD) when requests to <code>torproject.org</code> web servers have been logged.</li>
+<li><b>request_type:</b> Request type with fixed identifiers as follows:
+<ul>
+<li><b>"tbid":</b> Tor Browser initial downloads: GET requests to all sites with resource strings <code>'%/torbrowser/%.exe'</code>, <code>'%/torbrowser/%.dmg'</code>, and <code>'%/torbrowser/%.tar.xz'</code> and response code 200.</li>
+<li><b>"tbsd":</b> Tor Browser signature downloads: GET requests to all sites with resource strings <code>'%/torbrowser/%.exe.asc'</code>, <code>'%/torbrowser/%.dmg.asc'</code>, and <code>'%/torbrowser/%.tar.xz.asc'</code> and response code 200.</li>
+<li><b>"tbup":</b> Tor Browser update pings: GET requests to all sites with resource strings <code>'%/torbrowser/update_2/%'</code> and response code 200.</li>
+<li><b>"tbur":</b> Tor Browser update requests: GET requests to all sites with resource strings <code>'%/torbrowser/%.mar'</code> and response code 302.</li>
+<li><b>"tmid":</b> Tor Messenger initial downloads: GET requests to all sites with resource strings <code>'%/tormessenger/%.exe'</code>, <code>'%/tormessenger/%.dmg'</code>, and <code>'%/tormessenger/%.tar.xz'</code> and response code 200.</li>
+<li><b>"tmup":</b> Tor Messenger update pings: GET requests to all sites with resource strings <code>'%/tormessenger/update_2/%'</code> and response code 200.</li>
+<li><b>"twhph":</b> Tor website home page hits: GET requests to sites <code>'torproject.org'</code> and <code>'www.torproject.org'</code> with resource strings <code>'/'</code> and <code>'/index%'</code> and response code 200.</li>
+<li><b>"twdph":</b> Tor website download page hits: GET requests to sites <code>'torproject.org'</code> and <code>'www.torproject.org'</code> with resource strings <code>'/download/download%'</code> and <code>'/projects/torbrowser.html%'</code> and response code 200.</li>
+</ul>
+</li>
+<li><b>platform:</b> Platform string, like <b>"w"</b> for Windows, <b>"m"</b> for macOS, <b>"l"</b> for Linux, <b>"o"</b> for other platforms, and the empty string for all platforms.</li>
+<li><b>channel:</b> Release channel, like <b>"r"</b> for stable releases, <b>"a"</b> for alpha releases, <b>"h"</b> for hardened releases, and the empty string for all channels.</li>
+<li><b>locale:</b> Locale, like <b>"en-US"</b> for English (United States), <b>"de"</b> for German, etc., <b>"??"</b> for unrecognized locales, and the empty string for all locales.</li>
+<li><b>incremental:</b> Incremental update, with <b>"t"</b> for incremental updates, <b>"f"</b> for non-incremental (full) updates, and the empty string for all update types.</li>
+<li><b>count:</b> Number of requests for the given request type, platform, etc.</li>
+</ul>
+
+</div>
<jsp:include page="bottom.jsp"/>
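
To make the column documentation above more concrete, the first lines of webstats.csv might look like this (the header row comes from the module, the two value rows are hypothetical):

    log_date,request_type,platform,channel,locale,incremental,count
    2017-01-17,tbid,w,r,en-US,,12345
    2017-01-17,tbur,l,a,de,t,678

The first value row would count stable-channel Tor Browser initial downloads for Windows with an en-US locale (initial downloads carry no incremental flag, hence the empty field), and the second would count incremental update requests for a Linux alpha build with a German locale; both counts are made up for illustration.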