[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[or-cvs] r15478: Changed the command line. Now: -sort and truncate are separa (torflow/branches/gsoc2008/tools/BTAnalysis)



Author: fallon
Date: 2008-06-26 16:29:11 -0400 (Thu, 26 Jun 2008)
New Revision: 15478

Modified:
   torflow/branches/gsoc2008/tools/BTAnalysis/shufflebt.py
Log:
Changed the command line. Now:
- sorting (shuffling) and truncating are separate, optional steps
- graphing is optional
- multiple data files can be given as input

For example:

./shufflebt.py -n 100 -s -d shuffledir file1 file2 file3

This will write shuffled, truncated files to shuffledir and output paretoK and the mean for file1, file2, and file3.


Modified: torflow/branches/gsoc2008/tools/BTAnalysis/shufflebt.py
===================================================================
--- torflow/branches/gsoc2008/tools/BTAnalysis/shufflebt.py	2008-06-26 19:34:48 UTC (rev 15477)
+++ torflow/branches/gsoc2008/tools/BTAnalysis/shufflebt.py	2008-06-26 20:29:11 UTC (rev 15478)
@@ -4,8 +4,10 @@
 # Shuffles a list of  build times and produces a pdf of n of those buildtimes, 
 # which are put into 100ms blocks.
 # Requires gnuplot 4.2 and a version coreutils that provides sort -R 
-# usage: shufflebt.py -n <# circuits> -f <timefile> -d <outdirname>
-
+# "usage: shufflebt.py [-n <number of circuits>] [-s] [-g] [-k <k value>] [-d outdirname] <list of filenames>"
+# if outdir is not specified, the script will write files to the current directory
+# FIXME: you need to be in the directory containing the timefile when you call
+# this script, or supply an absolute path
 import getopt,sys,os
 import popen2
 import math,copy
@@ -15,7 +17,9 @@
     self.f = open(file)
     self.values = []
     for line in self.f:
-      self.values += [float(line[:-1])]
+      line = line.split('\t')
+      self.values += [float(line[1])]
+      
     self.f.close()
     self.buckets = {}
   def mean(self):
@@ -159,37 +163,56 @@
 
 
 def usage():
-  print "shufflebt.py -n <# circuits> -f <timefile> -d <outdirname> -k <k val.>"
+  print "usage: shufflebt.py [-n <number of circuits>] [-s] [-g] [-k <k value>] [-d outdirname] <list of filenames>"
+  sys.exit(1)
 
+def getargs():
+  # [-n <truncate to # circuits>] [-s] <list of filenames>
+  k = 3
+  sort =False
+  truncate = None
+  graph = False
+  outdirname = "." # will write to current directory if not specified
+  filenames = []
+  if len(sys.argv) < 2: usage()
+  else:
+    arglen = len(sys.argv[1:])
+    i = 0
+    while (arglen - i) > 0:
+      if sys.argv[i+1] == '-s': sort = True
+      elif sys.argv[i+1] == '-n': 
+        if not sys.argv[i + 2].isdigit(): usage()
+        truncate = sys.argv[i+2]
+        i += 1
+      elif sys.argv[i + 1] == '-g': graph = True
+      elif sys.argv[i + 1] == '-k': 
+        k = sys.argv[i + 2]
+        i += 1
+      elif sys.argv[i+1] == '-d': 
+        outdirname = sys.argv[i + 2]
+        i += 1
+      else:
+        filenames += [sys.argv[i+1]]
+      i += 1
 
-def setargs():
-  ncircuits = ""
-  dirname = ""
-  filename = ""
-  if len(sys.argv[1:]) < 4:
-    usage()
-    sys.exit(2)
-  try:
-    opts,args = getopt.getopt(sys.argv[1:],"n:f:d:k:")
-  except getopt.GetoptError,err:
-    print str(err)
-    usage()
-    sys.exit(2)
-  for o,a in opts:
-    if o == '-n': 
-      if a.isdigit():
-        ncircuits = a
-      else:
-        usage()
-    elif o == '-f': filename = a
-    elif o == '-d': dirname = a
-    elif o == '-k': k = float(a)
-    else:
-      assert False, "Bad option"
-  return ncircuits,filename,dirname,k
+
+  return sort, truncate,graph,outdirname,filenames,k
+        
+
+def shuffle(sort,truncate,filename,newfile):
+  if not sort and truncate is None: return
+  sortlocation = '/usr/local/bin/sort'
+  if sort and truncate:
+    cmd =  sortlocation + ' -R ' + filename + ' | head -n ' + truncate  + ' > ' + newfile
+  elif sort and not truncate:
+    cmd = sortlocation + ' -R ' + filename + ' > ' + newfile
+  elif not sort and truncate:
+    cmd = 'cat ' +  filename + ' | head -n ' + truncate  + ' > ' + newfile
+    
+  p = popen2.Popen4(cmd)
+  p.wait()
 if __name__ == "__main__":
-  ncircuits,filename,dirname,k = setargs()
-  print 'Num. Circuits:[',ncircuits,'] Filename:[',filename,'] Dir. Name:[',dirname,']'
+  sort, truncate,graph,dirname,filenames,k = getargs()
 
   # make new directory
   print 'Making new directory:',dirname
@@ -198,86 +221,83 @@
   else:
     print 'Dir exists, not making a new one'
 
-  # shuffle, create new file
-  print 'Shuffling...',
-
-  newfile = dirname + '/' + filename + '.' + ncircuits 
-  cmd = 'sort -R ' + filename + ' | head -n ' + ncircuits  + ' > ' + newfile
-
-  p = popen2.Popen4(cmd)
-  p.wait()
-  print 'Done'
-
-  # create histogram from file
-  print 'Calculating statistics and creating histogram...',
-  s = Stats(newfile)
-  s.makehistogram(100,newfile,newfile + '.hist')
-  mean = s.mean()
-  stddev = s.stddev()
-  median = s.median()
-  mode = s.mode()/10.0 # relies on s.makehistogram for buckets
-  parK = s.paretoK(mode)
-  modeN = s.modeN(mode)
-  modeMean = s.modeMean(mode)
-  print 'Done.'
-  print 'Mean: '+str(mean)+', mode: '+str(mode)
-  print 'ParK: '+str(parK)
-
-  # get stats
-
-  # create gnuplot file
-  print 'Creating gnuplot plot file...',
-  plotname =  newfile + '.plt'
- 
-  plotstr = "set terminal png transparent nocrop enhanced size 800,600\nset output '" + newfile + ".png'\nset style fill  solid 1.00 border -1\nset style histogram clustered gap 1 title  offset character 0, 0, 0\nset datafile missing '-'\nset style data histograms\nset title 'Buildtime Distribution Function for "+ ncircuits +" Circuits k=" + str(k) + "\nset ylabel '# Circuits'\nset xlabel 'time (in 100ms)'\n"
-  plotstr += "set label 'std dev=" + str(stddev) + "' at 170,15\n"
-
-  # FIXME: Hrmm... http://en.wikipedia.org/wiki/Skewness? Seems like a hack
-  # Or better: http://en.wikipedia.org/wiki/Gamma_distribution with k=3?
-  # Would make sense if this is the sum of 3 paretos for the individual
-  # hop distributions.
-
-  # Theta estimations 
-  maxliketheta = s.maxlikelihood(k)
-  baytheta = s.bayesian(k)
-
-  # N is the value to multipy the probabilities by
-  N = len(s.values)
-
-  #FIX? Other potential values of N: #circuits that match mode? median? mean?
-  #print 'Mean:',mean,'Median:', median,'Mode:', mode
-  #i = float("%3.0f" % int(mean * 10)) # crappy way of rounding
-  #i = int(mode * 10)
-  #N = s.buckets[i]            # num. circuits that have buildtimes 
-                              #close to mean/median/mode from histogram 
-
-#  plotstr += gamma(k,maxliketheta,N, 'maxl')
-#  plotstr += gamma(k,baytheta[0],N,'bayplus') # + stddev
-#  plotstr += gamma(k,baytheta[1],N,'bayminus') # - stddev
-
-  plotstr += pareto(parK,mode*10,modeN,'pareto')
-  plotstr += exp(modeMean*10,mode*10,modeN,'expShifted')
-
-  plotstr += "plot '" + newfile + ".hist' using 2,\\\n"
-
-  plotstr += "pareto(x) title '" + "Shifted Pareto', \\\n"
-  plotstr += "expShifted(x) title '" + "Shifted Exp' \n"
-
-
-  f = open(plotname,'w')
-  f.write(plotstr)
-  f.close()
-  print 'Done'
-
-  # plot the file
-  print 'Plotting...',
-  p = popen2.Popen4('gnuplot ' + plotname)
-#  p = popen2.Popen4('gp4.2 ' + plotname)
-
-  p.wait()
-  for err in p.fromchild:
-    print err
-  print 'Done'
-
+  for filename in filenames:
+    print 'Processing',filename
+    if truncate and sort or truncate and not sort:
+      newfile = dirname + '/' + filename + '.' + truncate + '.shuffled'
+    elif sort and not truncate:
+      newfile = dirname + '/' + filename + '.shuffled'
+    else:
+      newfile = filename 
+    # shuffle, create new file
+    shuffle(sort,truncate,filename,newfile)
   
+    # create histogram from file
+    s = Stats(newfile)
+    s.makehistogram(100,newfile,newfile + '.hist')
+    mean = s.mean()
+    stddev = s.stddev()
+    median = s.median()
+    mode = s.mode()/10.0 # relies on s.makehistogram for buckets
+    parK = s.paretoK(mode)
+    modeN = s.modeN(mode)
+    modeMean = s.modeMean(mode)
+    print 'Mean: '+str(mean)+', mode: '+str(mode)
+    print 'ParK: '+str(parK)
+  
+    # get stats
+  
+    if graph:
+      # create gnuplot file
+      ncircuits = str(len(s.values))
+      plotname =  newfile + '.plt'
+     
+      plotstr = "set terminal png transparent nocrop enhanced size 800,600\nset output '" + newfile + ".png'\nset style fill  solid 1.00 border -1\nset style histogram clustered gap 1 title  offset character 0, 0, 0\nset datafile missing '-'\nset style data histograms\nset title 'Buildtime Distribution Function for "+ ncircuits +" Circuits k=" + str(k) + "\nset ylabel '# Circuits'\nset xlabel 'time (in 100ms)'\n"
+      plotstr += "set label 'std dev=" + str(stddev) + "' at 170,15\n"
     
+      # FIXME: Hrmm... http://en.wikipedia.org/wiki/Skewness? Seems like a hack
+      # Or better: http://en.wikipedia.org/wiki/Gamma_distribution with k=3?
+      # Would make sense if this is the sum of 3 paretos for the individual
+      # hop distributions.
+    
+      # Theta estimations 
+      maxliketheta = s.maxlikelihood(k)
+      baytheta = s.bayesian(k)
+    
+      # N is the value to multipy the probabilities by
+      N = len(s.values)
+    
+      #FIX? Other potential values of N: #circuits that match mode? median? mean?
+      #print 'Mean:',mean,'Median:', median,'Mode:', mode
+      #i = float("%3.0f" % int(mean * 10)) # crappy way of rounding
+      #i = int(mode * 10)
+      #N = s.buckets[i]            # num. circuits that have buildtimes 
+                                  #close to mean/median/mode from histogram 
+    
+    #  plotstr += gamma(k,maxliketheta,N, 'maxl')
+    #  plotstr += gamma(k,baytheta[0],N,'bayplus') # + stddev
+    #  plotstr += gamma(k,baytheta[1],N,'bayminus') # - stddev
+    
+      plotstr += pareto(parK,mode*10,modeN,'pareto')
+      plotstr += exp(modeMean*10,mode*10,modeN,'expShifted')
+    
+      plotstr += "plot '" + newfile + ".hist' using 2,\\\n"
+    
+      plotstr += "pareto(x) title '" + "Shifted Pareto', \\\n"
+      plotstr += "expShifted(x) title '" + "Shifted Exp' \n"
+    
+    
+      f = open(plotname,'w')
+      f.write(plotstr)
+      f.close()
+    
+      # plot the file
+    #  p = popen2.Popen4('gnuplot ' + plotname)
+      p = popen2.Popen4('gp4.2 ' + plotname)
+    
+      p.wait()
+      for err in p.fromchild:
+        print err
+  
+    
+