[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] r20961: {projects} Several minor improvements to the bridge-descriptor sanitizer (projects/archives/trunk/bridge-desc-sanitizer)
Author: kloesing
Date: 2009-11-17 10:09:11 -0500 (Tue, 17 Nov 2009)
New Revision: 20961
Modified:
projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java
projects/archives/trunk/bridge-desc-sanitizer/HOWTO
Log:
Several minor improvements to the bridge-descriptor sanitizer
- switch to MaxMind geoip database
- scrub nicknames from the family line
- keep unreferenced descriptors
- replace references to missing descriptors with all zeros
- tweak the resulting directory structure a bit
Modified: projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java
===================================================================
--- projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java 2009-11-16 23:48:42 UTC (rev 20960)
+++ projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java 2009-11-17 15:09:11 UTC (rev 20961)
@@ -1,7 +1,9 @@
import java.io.*;
import java.util.*;
+import com.maxmind.geoip.LookupService;
import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.codec.binary.*;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.codec.binary.Base64;
public class ConvertBridgeDescs {
@@ -18,31 +20,26 @@
}
File inDir = new File(args[0]);
File geoipFile = new File(args[1]);
+ LookupService cl = new LookupService(geoipFile,
+ LookupService.GEOIP_MEMORY_CACHE);
+ Set<String> unresolved = new HashSet<String>();
+ unresolved.add("--");
+ unresolved.add("a1");
+ unresolved.add("a2");
+ unresolved.add("eu");
+ unresolved.add("ap");
String year = args[2];
String month = args[3];
int yearInt = Integer.parseInt(year);
int monthInt = Integer.parseInt(month);
- File outDir = new File(args[4]);
- if (!outDir.exists()) {
- outDir.mkdir();
- }
+ File outDir = new File(args[4] + File.separator
+ + "bridge-descriptors-" + year + "-" + month);
+ outDir.mkdirs();
SortedSet<File> statuses = new TreeSet<File>();
Set<File> descriptors = new HashSet<File>();
Set<File> extrainfos = new HashSet<File>();
- System.out.print("Parsing geoip.txt file... ");
- BufferedReader r = new BufferedReader(new FileReader(geoipFile));
- String line0 = null;
- SortedMap<Long, String> geoipDatabase = new TreeMap<Long, String>();
- while ((line0 = r.readLine()) != null) {
- if (!line0.startsWith("#"))
- geoipDatabase.put(Long.parseLong(line0.split(",")[0]),
- line0.substring(line0.indexOf(',') + 1));
- }
- System.out.println("Found " + geoipDatabase.size()
- + " entries (expected 100,000 +- 10,000).");
-
System.out.println("Checking files in " + inDir.getAbsolutePath()
+ "...");
Stack<File> directoriesLeftToParse = new Stack<File>();
@@ -57,10 +54,9 @@
while (!directoriesLeftToParse.isEmpty()) {
File directoryOrFile = directoriesLeftToParse.pop();
String filename = directoryOrFile.getName();
- boolean addDirectory = false;
if (directoryOrFile.isDirectory()) {
if (/* base directory */
- filename.equals("in") ||
+ filename.equals(inDir.getName()) ||
/* current month */
filename.startsWith(currentYearAndMonth) ||
/* last days of previous month */
@@ -69,7 +65,7 @@
/* first days of next month */
(filename.startsWith(nextYearAndMonth)
&& Integer.parseInt(filename.substring(19, 21)) < 6)) {
- for (File fileInDir: directoryOrFile.listFiles()) {
+ for (File fileInDir : directoryOrFile.listFiles()) {
directoriesLeftToParse.push(fileInDir);
}
}
@@ -99,7 +95,7 @@
for (String y : hex)
new File(outDir + File.separator + "extra-infos" + File.separator
+ x + File.separator + y).mkdirs();
- Set<File> writtenExtrainfos = new HashSet<File>();
+ int writtenExtrainfos = 0;
Map<String, String> extrainfoMapping = new HashMap<String, String>();
int parsed = 0;
for (File file : extrainfos) {
@@ -147,7 +143,7 @@
BufferedWriter bw = new BufferedWriter(new FileWriter(out));
bw.write(scrubbedDesc);
bw.close();
- writtenExtrainfos.add(out);
+ writtenExtrainfos++;
}
} else if (line.equals("-----BEGIN SIGNATURE-----")) {
skipSignature = true;
@@ -160,16 +156,15 @@
}
br.close();
}
- System.out.println("\nWrote " + writtenExtrainfos.size()
+ System.out.println("\nWrote " + writtenExtrainfos
+ " extra-info descriptors.");
System.out.print("Parsing server descriptors");
for (String x : hex)
for (String y : hex)
- new File(outDir + File.separator + "descriptors" + File.separator
- + x + File.separator + y).mkdirs();
- Set<File> writtenDescriptors = new HashSet<File>();
- Map<File, File> referencedExtraInfos = new HashMap<File, File>();
+ new File(outDir + File.separator + "server-descriptors"
+ + File.separator + x + File.separator + y).mkdirs();
+ int writtenDescriptors = 0;
Map<String, String> descriptorMapping = new HashMap<String, String>();
int found = 0, notfound = 0;
parsed = 0;
@@ -189,19 +184,10 @@
continue;
} else if (line.startsWith("router ")) {
original = new StringBuilder(line + "\n");
- country = "zz";
- String[] ipParts = line.split(" ")[2].replace('.', ' ').split(" ");
- long ipNum = Long.parseLong(ipParts[0]) * 256L * 256L * 256L
- + Long.parseLong(ipParts[1]) * 256L * 256L
- + Long.parseLong(ipParts[2]) * 256L
- + Long.parseLong(ipParts[3]);
- long intervalStart = -1;
- if (ipNum >= geoipDatabase.firstKey()) {
- intervalStart = geoipDatabase.subMap(0L, ipNum).lastKey();
- String dbContent = geoipDatabase.get(intervalStart);
- long intervalEnd = Long.parseLong(dbContent.split(",")[0]);
- if (ipNum <= intervalEnd)
- country = dbContent.split(",")[1].toLowerCase();
+ country = cl.getCountry(line.split(" ")[2]).getCode().
+ toLowerCase();
+ if (unresolved.contains(country)) {
+ country = "zz";
}
scrubbed = new StringBuilder("router Unnamed 127.0.0.1 "
+ line.split(" ")[3] + " " + line.split(" ")[4] + " "
@@ -238,40 +224,30 @@
}
descriptorMapping.put(originalHash, scrubbedHash);
if (haveExtraInfo != null) {
- File out = new File(outDir + File.separator + "descriptors"
- + File.separator + scrubbedHash.charAt(0) + File.separator
+ File out = new File(outDir + File.separator
+ + "server-descriptors" + File.separator
+ + scrubbedHash.charAt(0) + File.separator
+ scrubbedHash.charAt(1) + File.separator + scrubbedHash);
if (!out.exists()) {
BufferedWriter bw2 = new BufferedWriter(new FileWriter(out));
bw2.write(scrubbedDesc);
bw2.close();
- writtenDescriptors.add(out);
- String extraInfoHash = haveExtraInfo.toLowerCase();
- File extrainfoFile = new File(outDir + File.separator
- + "extra-infos" + File.separator
- + extraInfoHash.charAt(0) + File.separator
- + extraInfoHash.charAt(1) + File.separator
- + extraInfoHash);
- if (!extrainfoFile.exists()) {
- System.out.println("Extra-info descriptor '"
- + extrainfoFile + "' does not exist.");
- System.exit(1);
- }
- referencedExtraInfos.put(out, extrainfoFile);
+ writtenDescriptors++;
}
}
} else if (line.startsWith("opt extra-info-digest ")) {
String originalExtraInfo = line.split(" ")[2].toLowerCase();
if (!extrainfoMapping.containsKey(originalExtraInfo)) {
notfound++;
+ haveExtraInfo = "0000000000000000000000000000000000000000";
} else {
found++;
- original.append(line + "\n");
haveExtraInfo = extrainfoMapping.get(originalExtraInfo).
toUpperCase();
- scrubbed.append("opt extra-info-digest " + haveExtraInfo
- + "\n");
}
+ original.append(line + "\n");
+ scrubbed.append("opt extra-info-digest " + haveExtraInfo
+ + "\n");
} else if (line.startsWith("reject ")
|| line.startsWith("accept ")) {
if (!contactWritten) {
@@ -286,7 +262,6 @@
|| line.startsWith("published ")
|| line.startsWith("uptime ")
|| line.startsWith("bandwidth ")
- || line.startsWith("uptime ")
|| line.startsWith("opt hibernating ")
|| line.equals("opt hidden-service-dir")
|| line.equals("opt caches-extra-info")) {
@@ -295,11 +270,12 @@
} else if (line.startsWith("family ")) {
StringBuilder familyLine = new StringBuilder("family");
for (String s : line.substring(7).split(" ")) {
- if (s.startsWith("$"))
+ if (s.startsWith("$")) {
familyLine.append(" $" + DigestUtils.shaHex(Hex.decodeHex(
s.substring(1).toCharArray())).toUpperCase());
- else
- familyLine.append(" " + s);
+ } else {
+ familyLine.append(" Unnamed");
+ }
}
original.append(line + "\n");
scrubbed.append(familyLine.toString() + "\n");
@@ -319,14 +295,13 @@
}
br.close();
}
- System.out.println("\nWrote " + writtenDescriptors.size()
+ System.out.println("\nWrote " + writtenDescriptors
+ " bridge descriptors. While parsing, we found that we parsed "
+ found + " extra-info identifiers before, but are missing "
+ notfound + ". (The number of missing identifiers should be "
+ "significantly smaller.)");
System.out.print("Parsing network statuses");
- Set<File> referencedDescriptors = new HashSet<File>();
parsed = notfound = found = 0;
for (File file : statuses) {
if (parsed++ > statuses.size() / days) {
@@ -340,51 +315,45 @@
BufferedReader br = new BufferedReader(new FileReader(file));
String line = null;
StringBuilder scrubbed = new StringBuilder();
- boolean addSLine = false;
while ((line = br.readLine()) != null) {
if (line.startsWith("r ")) {
String[] parts = line.split(" ");
String bridgeIdentity = parts[2] + "==";
String hexBridgeIdentity = Hex.encodeHexString(
Base64.decodeBase64(bridgeIdentity));
- String hashedBridgeIdentity2 = Base64.encodeBase64String(
- DigestUtils.sha(Base64.decodeBase64(bridgeIdentity))).
- replace("=", "");
String hashedBridgeIdentity = Base64.encodeBase64String(
DigestUtils.sha(Base64.decodeBase64(bridgeIdentity))).
substring(0, 27);
String descIdentifier = parts[3] + "==";
String hexDescIdentifier = Hex.encodeHexString(
Base64.decodeBase64(descIdentifier));
+ String replacementDescIdentifier = null;
if (!descriptorMapping.containsKey(hexDescIdentifier)) {
notfound++;
- addSLine = false;
+ replacementDescIdentifier = "AAAAAAAAAAAAAAAAAAAAAAAAAAA";
} else {
found++;
String refDesc = descriptorMapping.get(hexDescIdentifier).
toLowerCase();
File descriptorFile = new File(outDir + File.separator
- + "descriptors" + File.separator + refDesc.charAt(0)
- + File.separator + refDesc.charAt(1) + File.separator
- + refDesc);
+ + "server-descriptors" + File.separator
+ + refDesc.charAt(0) + File.separator + refDesc.charAt(1)
+ + File.separator + refDesc);
if (!descriptorFile.exists()) {
System.out.println("Descriptor file '"
+ descriptorFile.getAbsolutePath() + "' does not exist.");
+ System.exit(1);
}
- String replacementDescIdentifier = Base64.encodeBase64String(
+ replacementDescIdentifier = Base64.encodeBase64String(
Hex.decodeHex(descriptorMapping.get(hexDescIdentifier).
toCharArray())).substring(0, 27);
- scrubbed.append("r Unnamed " + hashedBridgeIdentity
- + " " + replacementDescIdentifier + " " + parts[4] + " "
- + parts[5] + " 127.0.0.1 " + parts[7] + " " + parts[8]
- + "\n");
- addSLine = true;
- referencedDescriptors.add(descriptorFile);
}
+ scrubbed.append("r Unnamed " + hashedBridgeIdentity
+ + " " + replacementDescIdentifier + " " + parts[4] + " "
+ + parts[5] + " 127.0.0.1 " + parts[7] + " " + parts[8]
+ + "\n");
} else if (line.startsWith("s ")) {
- if (addSLine) {
- scrubbed.append(line + "\n");
- }
+ scrubbed.append(line + "\n");
} else {
System.out.println("Unknown line: " + line);
System.exit(1);
@@ -395,7 +364,6 @@
String[] date = timeString.substring(0, 10).split("-");
String time = timeString.substring(11, 17);
File dir = new File(outDir + File.separator + "statuses"
- + File.separator + date[0] + File.separator + date[1]
+ File.separator + date[2] + File.separator);
dir.mkdirs();
File out = new File(dir.getAbsolutePath() + File.separator + date[0]
@@ -412,38 +380,6 @@
+ notfound + ". (The number of missing identifiers should be "
+ "significantly smaller.)");
- Set<File> deleteFromReferencedExtraInfos = new HashSet<File>();
- for (File e : referencedExtraInfos.keySet()) {
- if (!referencedDescriptors.contains(e)) {
- deleteFromReferencedExtraInfos.add(e);
- }
- }
- for (File e : deleteFromReferencedExtraInfos) {
- referencedExtraInfos.remove(e);
- }
- SortedSet<File> deleteDescriptors = new TreeSet<File>();
- for (File e : writtenDescriptors) {
- if (!referencedDescriptors.contains(e)) {
- deleteDescriptors.add(e);
- }
- }
- SortedSet<File> deleteExtraInfos = new TreeSet<File>();
- for (File e : writtenExtrainfos) {
- if (!referencedExtraInfos.values().contains(e)) {
- deleteExtraInfos.add(e);
- }
- }
- System.out.println("Deleting " + deleteDescriptors.size()
- + " unreferenced bridge descriptors and "
- + deleteExtraInfos.size() + " extra-info descriptors (keeping "
- + (writtenDescriptors.size() - deleteDescriptors.size())
- + " bridge descriptors and " + (writtenExtrainfos.size()
- - deleteExtraInfos.size()) + " extra-info descriptors).");
- for (File e : deleteDescriptors)
- e.delete();
- for (File e : deleteExtraInfos)
- e.delete();
-
long finished = System.currentTimeMillis();
System.out.println("Processing took " + ((finished - started) / 1000)
+ " seconds.");
Modified: projects/archives/trunk/bridge-desc-sanitizer/HOWTO
===================================================================
--- projects/archives/trunk/bridge-desc-sanitizer/HOWTO 2009-11-16 23:48:42 UTC (rev 20960)
+++ projects/archives/trunk/bridge-desc-sanitizer/HOWTO 2009-11-17 15:09:11 UTC (rev 20961)
@@ -54,7 +54,9 @@
If there is contact information in a descriptor, the contact line is
changed to "somebody at ...". If there is none, a contact line is added
- saying "nobody at ..." in order to put in the country code.
+ saying "nobody at ..." in order to put in the country code. If the
+ bridge's IP address cannot be resolved to a country, the unassigned
+ country code "zz" is written to the contact line.
5. Replace nickname with Unnamed
@@ -64,6 +66,14 @@
addresses. All bridge nicknames are therefore replaced with the string
Unnamed.
+6. Replace references to descriptors
+
+ Changing anything in the server descriptors or extra-info descriptors
+ invalidates the references from network statuses or server descriptors,
+ respectively. All references are replaced with the new hashes of
+ referenced descriptors, if available. In case of missing descriptors,
+ references are replaced with all zeros (or 'A's in base 64 encoding).
+
Note that these processing steps only prevent people from learning about
new bridge locations. People who already know a bridge identity or location
can easily learn more about this bridge from the sanitized descriptors.
@@ -84,30 +94,39 @@
following assumed to be commons-codec-1.4.jar) in the same directory as
this HOWTO file.
+- Download MaxMind GeoIP Java library from http://geolite.maxmind.com/
+ download/geoip/api/java/ and generate a JAR file as described in the
+ README file. Place the resulting maxmindgeoip.jar in the same directory
+ as this HOWTO file.
+
- Copy the half-hourly snapshots named from-tonga-YYYY-MM-DDThhmmssZ.tar.gz
in a directory called data/ in the same directory as this HOWTO file.
- Run ./extract-bridges.sh to extract the half-hourly snapshots in data/
to separate directories in the newly created subdirectory in/ .
-- Copy the geoip.txt from the Tor sources (from /src/config/) to the same
- directory as this HOWTO file.
+- Put the binary MaxMind GeoIP database file that shall be used for
+ resolving IP addresses to country codes in the same directory as this
+ HOWTO file. Either the free or the commercial version of the database
+ can be used. For the archives provided by The Tor Project, the first
+ available commercial version of the subsequent month is used.
- Compile the Java class using
- $ javac -cp commons-codec-1.4.jar ConvertBridgeDescs.java
+ $ javac -cp commons-codec-1.4.jar:maxmindgeoip.jar
+ ConvertBridgeDescs.java
- Run the script, providing it with the parameters it needs:
- java -cp .:commons-codec-1.4.jar ConvertBridgeDescs
- <input directory> <geoip.txt file>
- <YYYY> <MM> <output directory>
+ java -cp .:commons-codec-1.4.jar:maxmindgeoip.jar ConvertBridgeDescs
+ <input directory> <geoip database file> <YYYY> <MM>
+ <output directory>
Note that YYYY and MM specify the month that shall be processed. The other
descriptors in the input directory are ignored.
A sample invocation might be:
- $ java -cp .:commons-codec-1.4.jar ConvertBridgeDescs in/ geoip.txt
- 2008 10 out/
+ $ java -cp .:commons-codec-1.4.jar:maxmindgeoip.jar ConvertBridgeDescs
+ in/ GeoIP-106_20081101.dat 2008 10 out/