changelog shortlog tags branches files raw gz bz2 help

Mercurial > hg > ventivac / changeset: choose fixed mask for rabin fingerprinting, make rabinparams output more usable for doing larger scale tests.

changeset 90: ac5d87081700
parent 89: 301697094735
child 91: e5c65c1a6221
author: Mechiel Lukkien <mechiel@ueber.net>
date: Tue, 24 Jul 2007 18:05:20 +0200
files: appl/cmd/rabinparams.b appl/cmd/testrabin.b appl/cmd/vacput.b appl/cmd/venti/put.b appl/lib/rabin.b module/rabin.m
description: choose fixed mask for rabin fingerprinting, make rabinparams output more usable for doing larger scale tests.

mask is now 1, to avoid having all zeros as content boundary. all tools have been changed accordingly.
     1.1--- a/appl/cmd/rabinparams.b	Fri Jul 20 14:23:15 2007 +0200
     1.2+++ b/appl/cmd/rabinparams.b	Tue Jul 24 18:05:20 2007 +0200
     1.3@@ -13,8 +13,8 @@
     1.4 print, sprint, fprint, fildes: import sys;
     1.5 Rcfg, Rfile: import rabin;
     1.6 
     1.7-bflag, dflag, vflag, iflag: int;
     1.8-prime, mod, width, mask, min, max: int;
     1.9+bflag, Bflag, dflag, vflag, iflag: int;
    1.10+prime, mod, width, min, max: int;
    1.11 rcfg: ref Rcfg;
    1.12 berr: ref Iobuf;
    1.13 
    1.14@@ -39,25 +39,20 @@
    1.15 	prime = 269;
    1.16 	mod = 1<<10;
    1.17 	width = 3;
    1.18-	mask = mod-1;
    1.19 	min = 1024;
    1.20 	max = 32*1024;
    1.21 
    1.22-	blocks = big 0;
    1.23-	blocksizes = array[56*1024] of {* => 0};
    1.24-	bounds = array[256] of array of (array of byte, int);
    1.25-
    1.26 	arg->init(args);
    1.27-	arg->setusage(arg->progname()+" [-biv] [-p prime] [-n width] [-m mod] [-k mask] [-s min] [-S max] dir ...");
    1.28+	arg->setusage(arg->progname()+" [-bBdiv] [-p prime] [-n width] [-m mod] [-s min] [-S max] dir ...");
    1.29 	while((c := arg->opt()) != 0)
    1.30 		case c {
    1.31 		'b' =>	bflag++;
    1.32+		'B' =>	Bflag++;
    1.33 		'd' =>	dflag++;
    1.34 		'i' =>	iflag++;
    1.35 		'p' =>	prime = int arg->earg();
    1.36 		'n' =>	width = int arg->earg();
    1.37 		'm' =>	mod = int arg->earg();
    1.38-		'k' =>	mask = int arg->earg();
    1.39 		's' =>	min = int arg->earg();
    1.40 		'S' =>	max = int arg->earg();
    1.41 		'v' =>	vflag++;
    1.42@@ -69,12 +64,16 @@
    1.43 		arg->usage();
    1.44 	rabin->debug = dflag;
    1.45 
    1.46+	blocks = big 0;
    1.47+	blocksizes = array[max] of {* => 0};
    1.48+	bounds = array[256] of array of (array of byte, int);
    1.49+
    1.50 	rerr: string;
    1.51-	(rcfg, rerr) = Rcfg.mk(prime, width, mod, mask);
    1.52+	(rcfg, rerr) = Rcfg.mk(prime, width, mod);
    1.53 	if(rerr != nil)
    1.54 		fail("rabincfg: "+rerr);
    1.55 
    1.56-	if(vflag > 1) {
    1.57+	if(bflag) {
    1.58 		berr = bufio->fopen(fildes(2), Bufio->OWRITE);
    1.59 		if(berr == nil)
    1.60 			fail(sprint("bufio open stderr: %r"));
    1.61@@ -85,26 +84,27 @@
    1.62 	if(berr != nil)
    1.63 		berr.close();
    1.64 
    1.65-	print("blocksizes:\n");
    1.66-	for(i := 0; i < len blocksizes; i++)
    1.67-		if(blocksizes[i] == 0)
    1.68-			continue;
    1.69-		else
    1.70-			print("%10d  %d\n", i, blocksizes[i]);
    1.71-	if(bflag) {
    1.72+	if(Bflag) {
    1.73+		for(i := 0; i < len blocksizes; i++)
    1.74+			if(blocksizes[i] == 0)
    1.75+				continue;
    1.76+			else
    1.77+				print("%10d  %d\n", i, blocksizes[i]);
    1.78+	}
    1.79+	if(vflag > 1) {
    1.80 		nbounds := 0;
    1.81-		for(i = 0; i < len bounds; i++) {
    1.82+		for(i := 0; i < len bounds; i++) {
    1.83 			for(j := 0; j < len bounds[i]; j++)
    1.84-				print("%-20s %d\n", fmtbound(bounds[i][j].t0), bounds[i][j].t1);
    1.85+				fprint(fildes(2), "%-20s %d\n", fmtbound(bounds[i][j].t0), bounds[i][j].t1);
    1.86 			nbounds += len bounds[i];
    1.87 		}
    1.88-		print("nbounds=%d\n", nbounds);
    1.89+		fprint(fildes(2), "nbounds=%d\n", nbounds);
    1.90 	}
    1.91 	print("blocks=%bd\n", blocks);
    1.92-	avg := 0;
    1.93+	mean := 0;
    1.94 	if(blocks != big 0)
    1.95-		avg = int (totalbytes/blocks);
    1.96-	print("average blocksize=%d\n", avg);
    1.97+		mean = int (totalbytes/blocks);
    1.98+	print("meanblocksize=%d\n", mean);
    1.99 }
   1.100 
   1.101 walk(path: string)
   1.102@@ -169,7 +169,7 @@
   1.103 	blocks++;
   1.104 	blocksizes[n]++;
   1.105 	buck := int d[0];
   1.106-	if(bflag) {
   1.107+	if(vflag > 1) {
   1.108 		for(i := 0; i < len bounds[buck]; i++) {
   1.109 			if(eq(bounds[buck][i].t0, d)) {
   1.110 				bounds[buck][i].t1++;
     2.1--- a/appl/cmd/testrabin.b	Fri Jul 20 14:23:15 2007 +0200
     2.2+++ b/appl/cmd/testrabin.b	Tue Jul 24 18:05:20 2007 +0200
     2.3@@ -35,19 +35,17 @@
     2.4 	p := 269;
     2.5 	m := 1<<13;
     2.6 	n := 3;
     2.7-	mask := m-1;
     2.8 	min := 1024;
     2.9 	max := 32*1024;
    2.10 
    2.11 	arg->init(args);
    2.12-	arg->setusage(arg->progname()+" [-p prime] [-n width] [-m mod] [-k mask] [-s min] [-S max] file");
    2.13+	arg->setusage(arg->progname()+" [-p prime] [-n width] [-m mod] [-s min] [-S max] file");
    2.14 	while((c := arg->opt()) != 0)
    2.15 		case c {
    2.16 		'd' =>	dflag++;
    2.17 		'p' =>	p = int arg->earg();
    2.18 		'n' =>	n = int arg->earg();
    2.19 		'm' =>	m = int arg->earg();
    2.20-		'k' =>	mask = int arg->earg();
    2.21 		's' =>	min = int arg->earg();
    2.22 		'S' =>	max = int arg->earg();
    2.23 		* =>	arg->usage();
    2.24@@ -58,7 +56,7 @@
    2.25 		arg->usage();
    2.26 	rabin->debug = dflag;
    2.27 
    2.28-	(rcfg, rerr) := Rcfg.mk(p, n, m, mask);
    2.29+	(rcfg, rerr) := Rcfg.mk(p, n, m);
    2.30 	if(rerr != nil)
    2.31 		fail("rabincfg: "+rerr);
    2.32 
    2.33@@ -69,6 +67,8 @@
    2.34 	if(oerr != nil)
    2.35 		fail("rfile open: "+oerr);
    2.36 
    2.37+	nblocks := 0;
    2.38+	nbytes := 0;
    2.39 	for(;;) {
    2.40 		(d, off, err) := rfile.read();
    2.41 		if(err != nil)
    2.42@@ -76,8 +76,14 @@
    2.43 		if(len d == 0)
    2.44 			break;
    2.45 		score := sha1(d);
    2.46+		nblocks++;
    2.47+		nbytes += len d;
    2.48 		print("offset=%bd, score=%s len=%d\n", off, score, len d);
    2.49 	}
    2.50+	mean := 0;
    2.51+	if(nblocks > 0)
    2.52+		mean = nbytes/nblocks;
    2.53+	print("%d blocks, %d mean block size\n", nblocks, mean);
    2.54 }
    2.55 
    2.56 sha1(a: array of byte): string
     3.1--- a/appl/cmd/vacput.b	Fri Jul 20 14:23:15 2007 +0200
     3.2+++ b/appl/cmd/vacput.b	Tue Jul 24 18:05:20 2007 +0200
     3.3@@ -58,7 +58,6 @@
     3.4 	prime := 269;
     3.5 	mod := 1<<10;
     3.6 	width := 3;
     3.7-	mask := mod-1;
     3.8 	blockmin = 1024;
     3.9 	blockmax = 32*1024;
    3.10 
    3.11@@ -90,7 +89,7 @@
    3.12 
    3.13 	if(rflag) {
    3.14 		err: string;
    3.15-		(rcfg, err) = Rcfg.mk(prime, width, mod, mask);
    3.16+		(rcfg, err) = Rcfg.mk(prime, width, mod);
    3.17 		if(err != nil)
    3.18 			error("rabincfg: "+err);
    3.19 	}
     4.1--- a/appl/cmd/venti/put.b	Fri Jul 20 14:23:15 2007 +0200
     4.2+++ b/appl/cmd/venti/put.b	Tue Jul 24 18:05:20 2007 +0200
     4.3@@ -49,7 +49,6 @@
     4.4 	prime := 269;
     4.5 	mod := 1<<10;
     4.6 	width := 3;
     4.7-	mask := mod-1;
     4.8 	blockmin := 1024;
     4.9 	blockmax := 32*1024;
    4.10 
    4.11@@ -72,7 +71,7 @@
    4.12 	rcfg: ref Rcfg;
    4.13 	if(rflag) {
    4.14 		err: string;
    4.15-		(rcfg, err) = Rcfg.mk(prime, width, mod, mask);
    4.16+		(rcfg, err) = Rcfg.mk(prime, width, mod);
    4.17 		if(err != nil)
    4.18 			error("rabincfg: "+err);
    4.19 	}
     5.1--- a/appl/lib/rabin.b	Fri Jul 20 14:23:15 2007 +0200
     5.2+++ b/appl/lib/rabin.b	Tue Jul 24 18:05:20 2007 +0200
     5.3@@ -23,9 +23,9 @@
     5.4 	return power;
     5.5 }
     5.6 
     5.7-Rcfg.mk(prime, width, mod, mask: int): (ref Rcfg, string)
     5.8+Rcfg.mk(prime, width, mod: int): (ref Rcfg, string)
     5.9 {
    5.10-	rcfg := ref Rcfg(prime, width, mod, mask, array[256] of int);
    5.11+	rcfg := ref Rcfg(prime, width, mod, array[256] of int);
    5.12 	if((mod & (mod-1)) != 0)
    5.13 		return (nil, "mod is not power of 2");
    5.14 
    5.15@@ -36,7 +36,7 @@
    5.16 		rcfg.tab[i] = (i * power) & (mod-1);
    5.17 say(sprint("tab[%d] = %d", i, rcfg.tab[i]));
    5.18 	}
    5.19-say(sprint("mod=%d prime=%d width=%d mask=%d", rcfg.mod, rcfg.prime, rcfg.width, rcfg.mask));
    5.20+say(sprint("mod=%d prime=%d width=%d", rcfg.mod, rcfg.prime, rcfg.width));
    5.21 	return (rcfg, nil);
    5.22 }
    5.23 
    5.24@@ -44,7 +44,7 @@
    5.25 open(rcfg: ref Rcfg, b: ref Iobuf, min, max: int): (ref Rfile, string)
    5.26 {
    5.27 	if(min > max)
    5.28-		return (nil, sprint("bad minx/max"));
    5.29+		return (nil, sprint("bad min/max"));
    5.30 	if(min < rcfg.width)
    5.31 		return (nil, "min < width");
    5.32 	c := ref Rfile(b, rcfg, min, max, array[max+rcfg.width] of byte, 0, 0, big 0);
    5.33@@ -59,18 +59,17 @@
    5.34 		if(ch == EOF)
    5.35 			break;
    5.36 		c.buf[c.n] = byte ch;
    5.37-say(sprint("i=%d prev=%d width=%d data[i]=%d modpower=%d prime=%d mod=%d", c.n, c.state, width, int c.buf[c.n], modpower(prime, width-1-c.n, mod), prime, mod));
    5.38-		c.state += ch*modpower(prime, width-1-c.n, mod);
    5.39+say(sprint("i=%d prev=%d width=%d data[i]=%d prime=%d mod=%d", c.n, c.state, width, int c.buf[c.n], prime, mod));
    5.40+		c.state = (prime*c.state + ch) & (mod-1);
    5.41 		c.n++;
    5.42 	}
    5.43-	c.state &= mod-1;
    5.44 say(sprint("initial state=%d", c.state));
    5.45 	return (c, nil);
    5.46 }
    5.47 
    5.48 Rfile.read(c: self ref Rfile): (array of byte, big, string)
    5.49 {
    5.50-	(prime, width, mod, mask) := (c.rcfg.prime, c.rcfg.width, c.rcfg.mod, c.rcfg.mask);
    5.51+	(prime, width, mod) := (c.rcfg.prime, c.rcfg.width, c.rcfg.mod);
    5.52 	for(;;) {
    5.53 		ch := c.b.getb();
    5.54 		if(ch == ERROR)
    5.55@@ -83,10 +82,13 @@
    5.56 			return (d, off, nil);
    5.57 		}
    5.58 		c.buf[c.n] = byte ch;
    5.59-		new := (prime*c.state + ch - c.rcfg.tab[int c.buf[c.n-width]]) & (mod-1);
    5.60+		new := (mod+prime*c.state + ch - c.rcfg.tab[int c.buf[c.n-width]]) & (mod-1);
    5.61+		blah := mod+prime*c.state + ch - c.rcfg.tab[int c.buf[c.n-width]];
    5.62+		if(blah < 0)
    5.63+			raise sprint("fail:blah=%d", blah);
    5.64 		c.n++;
    5.65 		c.state = new;
    5.66-		if(c.n-width >= c.max || (c.n-width >= c.min && (new & mask) == mask)) {
    5.67+		if(c.n-width >= c.max || (c.n-width >= c.min && new == 1)) {
    5.68 			d := array[c.n-width] of byte;
    5.69 			d[:] = c.buf[:len d];
    5.70 			off := c.off;
     6.1--- a/module/rabin.m	Fri Jul 20 14:23:15 2007 +0200
     6.2+++ b/module/rabin.m	Tue Jul 24 18:05:20 2007 +0200
     6.3@@ -8,10 +8,10 @@
     6.4 	open:	fn(rcfg: ref Rcfg, b: ref Iobuf, min, max: int): (ref Rfile, string);
     6.5 
     6.6 	Rcfg: adt {
     6.7-		prime, width, mod, mask: int;
     6.8+		prime, width, mod: int;
     6.9 		tab:	array of int;
    6.10 
    6.11-		mk:	fn(prime, width, mod, mask: int): (ref Rcfg, string);
    6.12+		mk:	fn(prime, width, mod: int): (ref Rcfg, string);
    6.13 	};
    6.14 
    6.15 	Rfile: adt {