[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [collector/release] Add webstats module with sync and local import functionality.
commit 97e577ae73ec631ac5d7448cb9f525594baa0c8a
Author: iwakeh <iwakeh@xxxxxxxxxxxxxx>
Date: Mon Oct 9 12:23:53 2017 +0000
Add webstats module with sync and local import functionality.
Implements task-22428.
---
CHANGELOG.md | 6 +-
build.xml | 2 +-
src/main/java/org/torproject/collector/Main.java | 2 +
.../torproject/collector/conf/Configuration.java | 3 +-
.../java/org/torproject/collector/conf/Key.java | 9 +-
.../collector/persist/DescriptorPersistence.java | 2 +
.../persist/WebServerAccessLogPersistence.java | 73 ++++++++
.../torproject/collector/sync/SyncPersistence.java | 7 +
.../torproject/collector/webstats/LogFileMap.java | 115 ++++++++++++
.../torproject/collector/webstats/LogMetadata.java | 87 +++++++++
.../collector/webstats/SanitizeWeblogs.java | 198 +++++++++++++++++++++
src/main/resources/collector.properties | 20 ++-
.../collector/conf/ConfigurationTest.java | 2 +-
.../collector/cron/CollecTorMainTest.java | 1 +
.../collector/sync/SyncPersistenceTest.java | 68 +++----
.../collector/webstats/LogFileMapTest.java | 33 ++++
.../collector/webstats/LogMetadataTest.java | 82 +++++++++
...eotrichon.torproject.org_access.log_20151007.xz | Bin 0 -> 4056 bytes
...meronense.torproject.org_access.log_20170531.gz | Bin 0 -> 388 bytes
19 files changed, 671 insertions(+), 39 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f4cd21..a0b5d1f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,8 @@
-# Changes in version 1.?.? - 201?-??-??
+# Changes in version 1.5.0 - 2018-01-31
+
+ * Major changes
+ - Update to metrics-lib 2.2.0.
+ - Add new module for processing and sanitizing Tor web server logs.
* Minor changes
- Exclude lastModifiedMillis in index.json.
diff --git a/build.xml b/build.xml
index f004f29..48f6e33 100644
--- a/build.xml
+++ b/build.xml
@@ -11,7 +11,7 @@
<property name="release.version" value="1.4.1-dev" />
<property name="project-main-class" value="org.torproject.collector.Main" />
<property name="name" value="collector"/>
- <property name="metricslibversion" value="2.1.1" />
+ <property name="metricslibversion" value="2.2.0" />
<property name="jarincludes" value="collector.properties logback.xml" />
<patternset id="runtime" >
diff --git a/src/main/java/org/torproject/collector/Main.java b/src/main/java/org/torproject/collector/Main.java
index 50cc8be..70cdbfa 100644
--- a/src/main/java/org/torproject/collector/Main.java
+++ b/src/main/java/org/torproject/collector/Main.java
@@ -14,6 +14,7 @@ import org.torproject.collector.exitlists.ExitListDownloader;
import org.torproject.collector.index.CreateIndexJson;
import org.torproject.collector.onionperf.OnionPerfDownloader;
import org.torproject.collector.relaydescs.ArchiveWriter;
+import org.torproject.collector.webstats.SanitizeWeblogs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -51,6 +52,7 @@ public class Main {
collecTorMains.put(Key.UpdateindexActivated, CreateIndexJson.class);
collecTorMains.put(Key.RelaydescsActivated, ArchiveWriter.class);
collecTorMains.put(Key.OnionPerfActivated, OnionPerfDownloader.class);
+ collecTorMains.put(Key.WebstatsActivated, SanitizeWeblogs.class);
}
private static Configuration conf = new Configuration();
diff --git a/src/main/java/org/torproject/collector/conf/Configuration.java b/src/main/java/org/torproject/collector/conf/Configuration.java
index 57f9731..72bd5fc 100644
--- a/src/main/java/org/torproject/collector/conf/Configuration.java
+++ b/src/main/java/org/torproject/collector/conf/Configuration.java
@@ -92,7 +92,8 @@ public class Configuration extends Observable implements Cloneable {
|| this.getBool(Key.BridgedescsActivated)
|| this.getBool(Key.ExitlistsActivated)
|| this.getBool(Key.UpdateindexActivated)
- || this.getBool(Key.OnionPerfActivated))) {
+ || this.getBool(Key.OnionPerfActivated)
+ || this.getBool(Key.WebstatsActivated))) {
throw new ConfigurationException("Nothing is activated!\n"
+ "Please edit collector.properties. Exiting.");
}
diff --git a/src/main/java/org/torproject/collector/conf/Key.java b/src/main/java/org/torproject/collector/conf/Key.java
index e0a20a7..6454009 100644
--- a/src/main/java/org/torproject/collector/conf/Key.java
+++ b/src/main/java/org/torproject/collector/conf/Key.java
@@ -28,6 +28,7 @@ public enum Key {
BridgeSources(SourceType[].class),
ExitlistSources(SourceType[].class),
OnionPerfSources(SourceType[].class),
+ WebstatsSources(SourceType[].class),
RelayCacheOrigins(String[].class),
RelayLocalOrigins(Path.class),
RelaySyncOrigins(URL[].class),
@@ -35,6 +36,8 @@ public enum Key {
BridgeLocalOrigins(Path.class),
ExitlistSyncOrigins(URL[].class),
OnionPerfSyncOrigins(URL[].class),
+ WebstatsSyncOrigins(URL[].class),
+ WebstatsLocalOrigins(Path.class),
BridgedescsActivated(Boolean.class),
BridgedescsOffsetMinutes(Integer.class),
BridgedescsPeriodMinutes(Integer.class),
@@ -58,7 +61,11 @@ public enum Key {
KeepDirectoryArchiveImportHistory(Boolean.class),
ReplaceIpAddressesWithHashes(Boolean.class),
BridgeDescriptorMappingsLimit(Integer.class),
- OnionPerfHosts(URL[].class);
+ OnionPerfHosts(URL[].class),
+ WebstatsActivated(Boolean.class),
+ WebstatsLimits(Boolean.class),
+ WebstatsOffsetMinutes(Integer.class),
+ WebstatsPeriodMinutes(Integer.class);
private Class clazz;
private static Set<String> keys;
diff --git a/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java b/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java
index 3e464fe..01c9fad 100644
--- a/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java
+++ b/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java
@@ -19,6 +19,7 @@ public abstract class DescriptorPersistence<T extends Descriptor> {
protected static final String BRIDGEDESCS = "bridge-descriptors";
protected static final String DASH = "-";
+ protected static final String DOT = ".";
protected static final String MICRODESC = "microdesc";
protected static final String MICRODESCS = "microdescs";
protected static final String RELAYDESCS = "relay-descriptors";
@@ -26,6 +27,7 @@ public abstract class DescriptorPersistence<T extends Descriptor> {
protected static final String EXTRA_INFOS = "extra-infos";
protected static final String SERVERDESC = "server-descriptor";
protected static final String SERVERDESCS = "server-descriptors";
+ protected static final String WEBSTATS = "webstats";
protected final T desc;
protected final byte[] annotation;
diff --git a/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java b/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java
new file mode 100644
index 0000000..792d3a9
--- /dev/null
+++ b/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java
@@ -0,0 +1,73 @@
+/* Copyright 2016--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.persist;
+
+import org.torproject.descriptor.WebServerAccessLog;
+import org.torproject.descriptor.internal.FileType;
+import org.torproject.descriptor.log.InternalLogDescriptor;
+import org.torproject.descriptor.log.InternalWebServerAccessLog;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.time.format.DateTimeFormatter;
+
+public class WebServerAccessLogPersistence
+ extends DescriptorPersistence<WebServerAccessLog> {
+
+ public static final String SEP = InternalWebServerAccessLog.SEP;
+ public static final FileType COMPRESSION = FileType.XZ;
+ private static final Logger log
+ = LoggerFactory.getLogger(WebServerAccessLogPersistence.class);
+
+ private DateTimeFormatter yearPattern = DateTimeFormatter.ofPattern("yyyy");
+ private DateTimeFormatter monthPattern = DateTimeFormatter.ofPattern("MM");
+ private DateTimeFormatter dayPattern = DateTimeFormatter.ofPattern("dd");
+
+ /** Prepare storing the given descriptor. */
+ public WebServerAccessLogPersistence(WebServerAccessLog desc) {
+ super(desc, new byte[0]);
+ byte[] compressedBytes = null;
+ try { // The descriptor bytes have to be stored compressed.
+ compressedBytes = COMPRESSION.compress(desc.getRawDescriptorBytes());
+ ((InternalLogDescriptor)desc).setRawDescriptorBytes(compressedBytes);
+ } catch (Exception ex) {
+ log.warn("Cannot compress '{}'. Storing uncompressed.", ex);
+ }
+ calculatePaths();
+ }
+
+ private void calculatePaths() {
+ String name =
+ this.desc.getVirtualHost() + SEP + this.desc.getPhysicalHost()
+ + SEP + "access.log"
+ + SEP + this.desc.getLogDate().format(DateTimeFormatter.BASIC_ISO_DATE)
+ + DOT + COMPRESSION.name().toLowerCase();
+ this.recentPath = Paths.get(WEBSTATS, name).toString();
+ this.storagePath = Paths.get(
+ WEBSTATS,
+ this.desc.getVirtualHost(),
+ this.desc.getLogDate().format(yearPattern), // year
+ this.desc.getLogDate().format(monthPattern), // month
+ this.desc.getLogDate().format(dayPattern), // day
+ name).toString();
+ }
+
+ /** Logs are not appended. */
+ @Override
+ public boolean storeAll(String recentRoot, String outRoot) {
+ return storeAll(recentRoot, outRoot, StandardOpenOption.CREATE_NEW,
+ StandardOpenOption.CREATE_NEW);
+ }
+
+ /** Logs are not appended. */
+ @Override
+ public boolean storeRecent(String recentRoot) {
+ return storeRecent(recentRoot, StandardOpenOption.CREATE_NEW);
+ }
+
+}
+
diff --git a/src/main/java/org/torproject/collector/sync/SyncPersistence.java b/src/main/java/org/torproject/collector/sync/SyncPersistence.java
index e230fca..142be7a 100644
--- a/src/main/java/org/torproject/collector/sync/SyncPersistence.java
+++ b/src/main/java/org/torproject/collector/sync/SyncPersistence.java
@@ -18,6 +18,7 @@ import org.torproject.collector.persist.PersistenceUtils;
import org.torproject.collector.persist.ServerDescriptorPersistence;
import org.torproject.collector.persist.StatusPersistence;
import org.torproject.collector.persist.VotePersistence;
+import org.torproject.collector.persist.WebServerAccessLogPersistence;
import org.torproject.descriptor.BridgeExtraInfoDescriptor;
import org.torproject.descriptor.BridgeNetworkStatus;
import org.torproject.descriptor.BridgeServerDescriptor;
@@ -28,6 +29,7 @@ import org.torproject.descriptor.RelayNetworkStatusConsensus;
import org.torproject.descriptor.RelayNetworkStatusVote;
import org.torproject.descriptor.RelayServerDescriptor;
import org.torproject.descriptor.TorperfResult;
+import org.torproject.descriptor.WebServerAccessLog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -132,6 +134,10 @@ public class SyncPersistence {
case "TorperfResult":
descPersist = new OnionPerfPersistence((TorperfResult) desc);
break;
+ case "WebServerAccessLog":
+ descPersist = new WebServerAccessLogPersistence(
+ (WebServerAccessLog) desc);
+ break;
default:
log.trace("Invalid descriptor type {} for sync-merge.",
clazz.getName());
@@ -149,3 +155,4 @@ public class SyncPersistence {
}
}
}
+
diff --git a/src/main/java/org/torproject/collector/webstats/LogFileMap.java b/src/main/java/org/torproject/collector/webstats/LogFileMap.java
new file mode 100644
index 0000000..c1a6802
--- /dev/null
+++ b/src/main/java/org/torproject/collector/webstats/LogFileMap.java
@@ -0,0 +1,115 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.time.LocalDate;
+import java.util.Optional;
+import java.util.TreeMap;
+import java.util.stream.Stream;
+
+/**
+ * Processes the given path and stores metadata for log files.
+ */
+public class LogFileMap
+ extends TreeMap<String, TreeMap<String, TreeMap<LocalDate, LogMetadata>>> {
+
+ private static final Logger log = LoggerFactory.getLogger(LogFileMap.class);
+
+ /**
+ * The map to keep track of the logfiles by virtual host,
+ * physical host, and date.
+ */
+ public LogFileMap(Path startDir) {
+ collectFiles(this, startDir);
+ }
+
+ private void collectFiles(final LogFileMap logFileMap, Path startDir) {
+ try {
+ Files.walkFileTree(startDir, new SimpleFileVisitor<Path>() {
+ @Override
+ public FileVisitResult visitFile(Path path, BasicFileAttributes att)
+ throws IOException {
+ Optional<LogMetadata> optionalMetadata = LogMetadata.create(path);
+ if (optionalMetadata.isPresent()) {
+ logFileMap.add(optionalMetadata.get());
+ }
+ return FileVisitResult.CONTINUE;
+ }
+
+ @Override
+ public FileVisitResult visitFileFailed(Path path, IOException ex)
+ throws IOException {
+ return logIfError(path, ex);
+ }
+
+ @Override
+ public FileVisitResult postVisitDirectory(Path path, IOException ex)
+ throws IOException {
+ return logIfError(path, ex);
+ }
+
+ private FileVisitResult logIfError(Path path, IOException ex) {
+ if (null != ex) {
+ log.warn("Cannot process '{}'.", path, ex);
+ }
+ return FileVisitResult.CONTINUE;
+ }
+ });
+ } catch (IOException ex) {
+ log.error("Cannot read directory '{}'.", startDir, ex);
+ }
+ }
+
+ /** Add log metadata to the map structure. */
+ public void add(LogMetadata metadata) {
+ TreeMap<String, TreeMap<LocalDate, LogMetadata>> virtualHosts
+ = this.get(metadata.virtualHost);
+ if (null == virtualHosts) {
+ virtualHosts = new TreeMap<String, TreeMap<LocalDate, LogMetadata>>();
+ this.put(metadata.virtualHost, virtualHosts);
+ }
+ TreeMap<LocalDate, LogMetadata> physicalHosts
+ = virtualHosts.get(metadata.physicalHost);
+ if (null == physicalHosts) {
+ physicalHosts = new TreeMap<LocalDate, LogMetadata>();
+ virtualHosts.put(metadata.physicalHost, physicalHosts);
+ }
+ physicalHosts.put(metadata.date, metadata);
+ }
+
+ /**
+ * Takes the given metadata and returns the LogMetadata for the entry
+ * of the next day.
+ */
+ public Optional<LogMetadata> nextDayLogFor(LogMetadata metadata) {
+ TreeMap<String, TreeMap<LocalDate, LogMetadata>> virtualHosts
+ = this.get(metadata.virtualHost);
+ if (null == virtualHosts) {
+ return Optional.empty();
+ }
+ TreeMap<LocalDate, LogMetadata> physicalHosts
+ = virtualHosts.get(metadata.physicalHost);
+ if (null == physicalHosts) {
+ return Optional.empty();
+ }
+ return Optional.ofNullable(physicalHosts.get(metadata.date.plusDays(1)));
+ }
+
+ /** Returns a stream of all contained log metadata. */
+ public Stream<LogMetadata> metadataStream() {
+ return this.values().stream()
+ .flatMap((virtualHosts) -> virtualHosts.values().stream())
+ .flatMap((physicalHosts) -> physicalHosts.values().stream());
+ }
+}
+
diff --git a/src/main/java/org/torproject/collector/webstats/LogMetadata.java b/src/main/java/org/torproject/collector/webstats/LogMetadata.java
new file mode 100644
index 0000000..ee0db1a
--- /dev/null
+++ b/src/main/java/org/torproject/collector/webstats/LogMetadata.java
@@ -0,0 +1,87 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import static org.torproject.descriptor.log.WebServerAccessLogImpl.MARKER;
+
+import org.torproject.descriptor.internal.FileType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Path;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class LogMetadata {
+
+ private static final Logger log
+ = LoggerFactory.getLogger(LogMetadata.class);
+
+ /** The mandatory web server log descriptor file name pattern. */
+ public static final Pattern filenamePattern
+ = Pattern.compile("(\\S*)-" + MARKER
+ + "-(\\d{8})(?:\\.?)([a-zA-Z0-9]+)$");
+
+ /** The path of the log file to be imported. */
+ public final Path path;
+
+ /** The date the log entries were created. */
+ public final LocalDate date;
+
+ /** The log's compression type. */
+ public final FileType fileType;
+
+ /** The name of the physical host. */
+ public final String physicalHost;
+
+ /** The name of the virtual host. */
+ public final String virtualHost;
+
+ private LogMetadata(Path logPath, String physicalHost, String virtualHost,
+ LocalDate logDate, FileType fileType) {
+ this.path = logPath;
+ this.date = logDate;
+ this.fileType = fileType;
+ this.physicalHost = physicalHost;
+ this.virtualHost = virtualHost;
+ }
+
+ /**
+ * Only way to create a LogMetadata object from a given log path.
+ */
+ public static Optional<LogMetadata> create(Path logPath) {
+ LogMetadata metadata = null;
+ try {
+ Path parentPath = logPath.getName(logPath.getNameCount() - 2);
+ Path file = logPath.getFileName();
+ if (null != parentPath && null != file) {
+ String physicalHost = parentPath.toString();
+ Matcher mat = filenamePattern.matcher(file.toString());
+ if (mat.find()) {
+ String virtualHost = mat.group(1);
+ // verify date given
+ LocalDate logDate
+ = LocalDate.parse(mat.group(2), DateTimeFormatter.BASIC_ISO_DATE);
+ if (null == virtualHost || null == physicalHost || null == logDate
+ || virtualHost.isEmpty() || physicalHost.isEmpty()) {
+ log.debug("Non-matching file encountered: '{}/{}'.",
+ parentPath, file);
+ } else {
+ metadata = new LogMetadata(logPath, physicalHost, virtualHost,
+ logDate, FileType.findType(mat.group(3)));
+ }
+ }
+ }
+ } catch (Throwable ex) {
+ metadata = null;
+ log.debug("Problem parsing path '{}'.", logPath, ex);
+ }
+ return Optional.ofNullable(metadata);
+ }
+}
+
diff --git a/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java b/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java
new file mode 100644
index 0000000..88d62fa
--- /dev/null
+++ b/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java
@@ -0,0 +1,198 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import org.torproject.collector.conf.Configuration;
+import org.torproject.collector.conf.ConfigurationException;
+import org.torproject.collector.conf.Key;
+import org.torproject.collector.conf.SourceType;
+import org.torproject.collector.cron.CollecTorMain;
+
+import org.torproject.collector.persist.PersistenceUtils;
+import org.torproject.collector.persist.WebServerAccessLogPersistence;
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.WebServerAccessLog;
+import org.torproject.descriptor.log.InternalLogDescriptor;
+import org.torproject.descriptor.log.InternalWebServerAccessLog;
+import org.torproject.descriptor.log.WebServerAccessLogImpl;
+import org.torproject.descriptor.log.WebServerAccessLogLine;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.StringJoiner;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * This module processes web-logs for CollecTor according to the weblog
+ * sanitation specification published on metrics.torproject.org.
+ */
+public class SanitizeWeblogs extends CollecTorMain {
+
+ private static final Logger log =
+ LoggerFactory.getLogger(SanitizeWeblogs.class);
+ private static final int LIMIT = 2;
+
+ private static final String WEBSTATS = "webstats";
+ private String outputPathName;
+ private String recentPathName;
+ private boolean limits;
+
+ /**
+ * Possibly privacy impacting data is replaced by dummy data producing a
+ * log-file (or files) that conform(s) to Apache's Combined Log Format.
+ */
+ public SanitizeWeblogs(Configuration conf) {
+ super(conf);
+ this.mapPathDescriptors.put("recent/webstats", WebServerAccessLog.class);
+ }
+
+ @Override
+ public String module() {
+ return WEBSTATS;
+ }
+
+ @Override
+ protected String syncMarker() {
+ return "Webstats";
+ }
+
+ @Override
+ protected void startProcessing() throws ConfigurationException {
+ try {
+ Files.createDirectories(this.config.getPath(Key.OutputPath));
+ Files.createDirectories(this.config.getPath(Key.RecentPath));
+ this.outputPathName = this.config.getPath(Key.OutputPath).toString();
+ this.recentPathName = this.config.getPath(Key.RecentPath).toString();
+ this.limits = this.config.getBool(Key.WebstatsLimits);
+ Set<SourceType> sources = this.config.getSourceTypeSet(
+ Key.WebstatsSources);
+ if (sources.contains(SourceType.Local)) {
+ findCleanWrite(this.config.getPath(Key.WebstatsLocalOrigins));
+ PersistenceUtils.cleanDirectory(this.config.getPath(Key.RecentPath));
+ }
+ } catch (Exception e) {
+ log.error("Cannot sanitize web-logs: " + e.getMessage(), e);
+ throw new RuntimeException(e);
+ }
+ }
+
+ private void findCleanWrite(Path dir) {
+ LogFileMap fileMapIn = new LogFileMap(dir);
+ log.info("Found log files for {} virtual hosts.", fileMapIn.size());
+ for (Map.Entry<String,TreeMap<String,TreeMap<LocalDate,LogMetadata>>>
+ virtualEntry : fileMapIn.entrySet()) {
+ String virtualHost = virtualEntry.getKey();
+ for (Map.Entry<String, TreeMap<LocalDate, LogMetadata>> physicalEntry
+ : virtualEntry.getValue().entrySet()) {
+ String physicalHost = physicalEntry.getKey();
+ log.info("Processing logs for {} on {}.", virtualHost, physicalHost);
+ Map<LocalDate, List<WebServerAccessLogLine>> linesByDate
+ = physicalEntry.getValue().values().stream().parallel()
+ .flatMap((LogMetadata metadata)
+ -> lineStream(metadata).filter((line) -> line.isValid()))
+ .collect(Collectors.groupingBy(WebServerAccessLogLine::getDate,
+ Collectors.toList()));
+ LocalDate[] interval = determineInterval(linesByDate.keySet());
+ linesByDate.entrySet().stream()
+ .filter((entry) -> entry.getKey().isAfter(interval[0])
+ && entry.getKey().isBefore(interval[1]))
+ .forEach((entry) -> storeSanitized(virtualHost, physicalHost,
+ entry.getKey(), entry.getValue()));
+ }
+ }
+ }
+
+ private void storeSanitized(String virtualHost, String physicalHost,
+ LocalDate date, List<WebServerAccessLogLine> lines) {
+ String name = new StringJoiner(InternalLogDescriptor.SEP)
+ .add(virtualHost).add(physicalHost)
+ .add(InternalWebServerAccessLog.MARKER)
+ .add(date.format(DateTimeFormatter.BASIC_ISO_DATE)).toString();
+ log.debug("Sanitizing {}.", name);
+ List<String> retainedLines = lines
+ .stream().map((line) -> sanitize(line, date))
+ .filter((line) -> line.isPresent()).map((line) -> line.get())
+ .collect(Collectors.toList());
+ retainedLines.sort(null);
+ try {
+ WebServerAccessLogPersistence walp
+ = new WebServerAccessLogPersistence(
+ new WebServerAccessLogImpl(retainedLines, name));
+ log.debug("Storing {}.", name);
+ walp.storeOut(this.outputPathName);
+ walp.storeRecent(this.recentPathName);
+ } catch (DescriptorParseException dpe) {
+ log.error("Cannot store log desriptor {}.", name, dpe);
+ }
+ }
+
+ static Optional<String> sanitize(WebServerAccessLogLine logLine,
+ LocalDate date) {
+ if (!logLine.isValid()
+ || !("GET".equals(logLine.getMethod())
+ || "HEAD".equals(logLine.getMethod()))
+ || !logLine.getProtocol().startsWith("HTTP")
+ || 400 == logLine.getResponse() || 404 == logLine.getResponse()) {
+ return Optional.empty();
+ }
+ if (!logLine.getIp().startsWith("0.0.0.")) {
+ logLine.setIp("0.0.0.0");
+ }
+ int queryStart = logLine.getRequest().indexOf("?");
+ if (queryStart > 0) {
+ logLine.setRequest(logLine.getRequest().substring(0, queryStart));
+ }
+ return Optional.of(logLine.toLogString());
+ }
+
+ LocalDate[] determineInterval(Set<LocalDate> dates) {
+ SortedSet<LocalDate> sorted = new TreeSet<>();
+ sorted.addAll(dates);
+ if (this.limits) {
+ for (int i = 0; i < LIMIT - 1; i++) {
+ sorted.remove(sorted.last());
+ }
+ }
+ if (sorted.isEmpty()) {
+ return new LocalDate[]{LocalDate.MAX, LocalDate.MIN};
+ }
+ if (!this.limits) {
+ sorted.add(sorted.first().minusDays(1));
+ sorted.add(sorted.last().plusDays(1));
+ }
+ return new LocalDate[]{sorted.first(), sorted.last()};
+ }
+
+ private Stream<WebServerAccessLogLine> lineStream(LogMetadata metadata) {
+ log.debug("Processing file {}.", metadata.path);
+ try (BufferedReader br
+ = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
+ metadata.fileType.decompress(Files.readAllBytes(metadata.path)))))) {
+ return br.lines()
+ .map((String line) -> WebServerAccessLogLine.makeLine(line))
+ .collect(Collectors.toList()).stream();
+ } catch (Exception ex) {
+ log.debug("Skipping log-file {}.", metadata.path, ex);
+ }
+ return Stream.empty();
+ }
+
+}
+
diff --git a/src/main/resources/collector.properties b/src/main/resources/collector.properties
index 0a9f932..30dda2a 100644
--- a/src/main/resources/collector.properties
+++ b/src/main/resources/collector.properties
@@ -41,6 +41,12 @@ UpdateindexActivated = false
UpdateindexPeriodMinutes = 2
# offset in minutes since the epoch and
UpdateindexOffsetMinutes = 0
+# the following defines, if this module is activated
+WebstatsActivated = false
+# period in minutes
+WebstatsPeriodMinutes = 360
+# offset in minutes since the epoch and
+WebstatsOffsetMinutes = 31
##########################################
## All below can be changed at runtime.
#####
@@ -154,4 +160,16 @@ OnionPerfSyncOrigins = https://collector.torproject.org
## the second, etc.:
## OnionPerfHosts = http://first.torproject.org/, http://second.torproject.org/
OnionPerfHosts = https://op-us.onionperf.torproject.net/
-
+######## Tor Weblogs ########
+#
+## Define descriptor sources
+# possible values: Local, Sync
+WebstatsSources = Local
+# Retrieve files from the following CollecTor instances.
+# List of URLs separated by comma.
+WebstatsSyncOrigins = https://collector.torproject.org
+## Relative path to directory to import logfiles from.
+WebstatsLocalOrigins = in/webstats
+# Default 'true' behaves as stated in section 4 of
+# https://metrics.torproject.org/web-server-logs.html
+WebstatsLimits = true
diff --git a/src/test/java/org/torproject/collector/conf/ConfigurationTest.java b/src/test/java/org/torproject/collector/conf/ConfigurationTest.java
index dfb06b2..fcaa71f 100644
--- a/src/test/java/org/torproject/collector/conf/ConfigurationTest.java
+++ b/src/test/java/org/torproject/collector/conf/ConfigurationTest.java
@@ -40,7 +40,7 @@ public class ConfigurationTest {
public void testKeyCount() throws Exception {
assertEquals("The number of properties keys in enum Key changed."
+ "\n This test class should be adapted.",
- 45, Key.values().length);
+ 52, Key.values().length);
}
@Test()
diff --git a/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java b/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java
index 79c1bd7..025f96c 100644
--- a/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java
+++ b/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java
@@ -71,6 +71,7 @@ public class CollecTorMainTest {
case "Bridge":
case "Exitlist":
case "OnionPerf":
+ case "Webstats":
assertNotNull("Property '" + key
+ "' not specified in " + Main.CONF_FILE + ".",
props.getProperty(key));
diff --git a/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java b/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java
index 2774c8d..489a413 100644
--- a/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java
+++ b/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java
@@ -28,6 +28,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
+import java.util.stream.Collectors;
@RunWith(Parameterized.class)
public class SyncPersistenceTest {
@@ -49,6 +50,26 @@ public class SyncPersistenceTest {
Integer.valueOf(1),
Integer.valueOf(1)},
+ {"webstats/archive.torproject.org_"
+ + "archeotrichon.torproject.org_access.log_20151007.xz",
+ new String[]{"webstats/archive.torproject.org/2015/10/07/"
+ + "archive.torproject.org_archeotrichon.torproject.org"
+ + "_access.log_20151007.xz"},
+ "archeotrichon.torproject.org/archive.torproject.org_"
+ + "archeotrichon.torproject.org_access.log_20151007.xz",
+ Integer.valueOf(1),
+ Integer.valueOf(1)},
+
+ {"webstats/metrics.torproject.org_"
+ + "meronense.torproject.org_access.log_20170531.xz",
+ new String[]{"webstats/metrics.torproject.org/2017/05/31/"
+ + "metrics.torproject.org_meronense.torproject.org"
+ + "_access.log_20170531.xz"},
+ "meronense.torproject.org/metrics.torproject.org_"
+ + "meronense.torproject.org_access.log_20170531.gz",
+ Integer.valueOf(1),
+ Integer.valueOf(1)},
+
{"relay-descriptors/server-descriptors/"
+ "2016-10-05-19-06-17-server-descriptors",
new String[]{"relay-descriptors/server-descriptor/2016/10/e/3/"
@@ -266,6 +287,9 @@ public class SyncPersistenceTest {
@Test()
public void testRecentFileContent() throws Exception {
+ if (this.filename.contains(".log")) {
+ return; // Skip this test, because logs are compressed and sanitized.
+ }
conf = new Configuration();
makeTemporaryFolders();
DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
@@ -292,6 +316,9 @@ public class SyncPersistenceTest {
@Test()
public void testOutFileContent() throws Exception {
+ if (this.filename.contains(".log")) {
+ return; // Skip this test, because logs are compressed and sanitized.
+ }
conf = new Configuration();
makeTemporaryFolders();
DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
@@ -305,9 +332,8 @@ public class SyncPersistenceTest {
List<String> expContent = linesFromResource(filename);
int expContentSize = expContent.size();
for (File file : outputList) {
- List<String> content = Files.readAllLines(file.toPath(),
- StandardCharsets.UTF_8);
- for (String line : content) {
+ for (String line : Files.readAllLines(file.toPath(),
+ StandardCharsets.UTF_8)) {
assertTrue("Couldn't remove " + line + ".", expContent.remove(line));
assertEquals(--expContentSize, expContent.size());
}
@@ -325,49 +351,25 @@ public class SyncPersistenceTest {
}
private byte[] bytesFromResource() throws Exception {
- StringBuilder sb = new StringBuilder();
- BufferedReader br = new BufferedReader(new InputStreamReader(getClass()
- .getClassLoader().getResourceAsStream(filename)));
- String line = br.readLine();
- while (null != line) {
- sb.append(line).append('\n');
- line = br.readLine();
- }
- return sb.toString().getBytes();
+ return Files.readAllBytes((new File(getClass()
+ .getClassLoader().getResource(filename).toURI())).toPath());
}
private String stringFromResource() throws Exception {
- StringBuilder sb = new StringBuilder();
BufferedReader br = new BufferedReader(new InputStreamReader(getClass()
.getClassLoader().getResourceAsStream(filename)));
- String line = br.readLine();
- while (null != line) {
- sb.append(line).append('\n');
- line = br.readLine();
- }
- return sb.toString();
+ return br.lines().collect(Collectors.joining("\n", "", "\n"));
}
private String stringFromFile(File file) throws Exception {
- StringBuilder sb = new StringBuilder();
- List<String> lines = Files.readAllLines(file.toPath(),
- StandardCharsets.UTF_8);
- for (String line : lines) {
- sb.append(line).append('\n');
- }
- return sb.toString();
+ return Files.lines(file.toPath(), StandardCharsets.UTF_8)
+ .collect(Collectors.joining("\n", "", "\n"));
}
private List<String> linesFromResource(String filename) throws Exception {
- List<String> res = new ArrayList<>();
BufferedReader br = new BufferedReader(new InputStreamReader(getClass()
.getClassLoader().getResourceAsStream(filename)));
- String line = br.readLine();
- while (null != line) {
- res.add(line);
- line = br.readLine();
- }
- return res;
+ return br.lines().collect(Collectors.toList());
}
}
diff --git a/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java b/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java
new file mode 100644
index 0000000..d55ba40
--- /dev/null
+++ b/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java
@@ -0,0 +1,33 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.nio.file.Paths;
+import java.util.Optional;
+
+public class LogFileMapTest {
+
+ @Rule
+ public TemporaryFolder tmpf = new TemporaryFolder();
+
+ @Test
+ public void makeLogFileMap() throws Exception {
+ LogFileMap lfm = new LogFileMap(tmpf.newFolder().toPath());
+ for (String path : new String[] {"in/ph1/vh1-access.log-20170901.gz",
+ "in/ph1/vh1-access.log-20170902.xz"}) {
+ Optional<LogMetadata> element
+ = LogMetadata.create(Paths.get(path));
+ assertTrue(element.isPresent());
+ lfm.add(element.get());
+ }
+ }
+
+}
+
diff --git a/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java b/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java
new file mode 100644
index 0000000..6121e8d
--- /dev/null
+++ b/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java
@@ -0,0 +1,82 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Optional;
+
+@RunWith(Parameterized.class)
+public class LogMetadataTest {
+
+ /** Path and expected values of LogMetadata. */
+ @Parameters
+ public static Collection<Object[]> pathResult() {
+ return Arrays.asList(new Object[][] {
+ {Paths.get("in", "ph1", "vh1-error.log-20170902.xz"),
+ "10001010", Boolean.FALSE,
+ "Non-access logs should be discarded."},
+ {Paths.get("in", "ph1", "vh1-access.log-2017.xz"),
+ "10001010", Boolean.FALSE,
+ "Log file should be discarded, because of wrong date format."},
+ {Paths.get("in", "ph1", "vh1-access.log.xz"),
+ "10001010", Boolean.FALSE,
+ "Log file should be discarded, because of the missing date."},
+ {Paths.get("vh1-access.log-20170901.gz"),
+ "10001010", Boolean.FALSE,
+ "Should be discarded because of missing physical host information."},
+ {Paths.get("in", "ph1", "vh1-access.log-20170901.gz"),
+ "20170901", Boolean.TRUE,
+ "Should have been accepted."},
+ {Paths.get("", "vh1-access.log-20170901.gz"),
+ "20170901", Boolean.FALSE,
+ "Should not result in metadata."},
+ {Paths.get("x", "vh1-access.log-.gz"),
+ "20170901", Boolean.FALSE,
+ "Should not result in metadata."},
+ {Paths.get("/collection/download/in/ph2", "vh2-access.log-20180901.xz"),
+ "20180901", Boolean.TRUE,
+ "Should have been accepted."}
+ });
+ }
+
+ private Path path;
+ private LocalDate date;
+ private boolean valid;
+ private String failureMessage;
+
+ /** Set all test values. */
+ public LogMetadataTest(Path path, String dateString, boolean valid,
+ String message) {
+ this.path = path;
+ this.date = LocalDate.parse(dateString, DateTimeFormatter.BASIC_ISO_DATE);
+ this.valid = valid;
+ this.failureMessage = message;
+ }
+
+ @Test
+ public void testCreate() throws Exception {
+ Optional<LogMetadata> element = LogMetadata.create(this.path);
+ assertEquals(this.failureMessage, this.valid, element.isPresent());
+ if (!this.valid) {
+ return;
+ }
+ LogMetadata lmd = element.get();
+ assertEquals(this.date, lmd.date);
+ assertEquals(this.path, lmd.path);
+ }
+
+}
+
diff --git a/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz
new file mode 100644
index 0000000..b459742
Binary files /dev/null and b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz differ
diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz
new file mode 100644
index 0000000..8c2333b
Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz differ
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits