(drak)
2016-02-12: add anonymization script (anonymize-csv.scm).
diff --git a/anonymize-csv.scm b/anonymize-csv.scm
new file mode 100755
--- /dev/null
+++ b/anonymize-csv.scm
@@ -0,0 +1,157 @@
+#!/bin/sh
+# -*- scheme -*-
+exec guile -e main -s "$0" "$@"
+!#
+
+;; Double-Anonymize the trust.csv by replacing keys with
+;; index-numbers. This prevents results from evaluations of the trust
+;; graph from being applied directly to correlation attacks on Freenet
+;; users.
+;;
+;; Usage: anonymize-csv.scm [INFILE [OUTFILE]]
+;; INFILE defaults to "trust.csv", OUTFILE to "trust-anonymized.csv".
+
+;; TODO: use vhashes instead of regular hash tables.
+
+(use-modules (ice-9 rdelim)
+             (ice-9 i18n)
+             (srfi srfi-69) ; hash tables
+             (srfi srfi-1) ; first, second, third
+             )
+
+
+;; A set is a hash table whose keys are the members; the values are unused.
+
+;; Add each of ELEMENTS to the set TABLE (mutating it) and return TABLE.
+(define (set-add table . elements)
+  (for-each (lambda (element)
+              (hash-table-set! table element #t))
+            elements)
+  table)
+
+;; Return the members of the set TABLE as a list.
+(define (set-keys table)
+  (hash-table-keys table))
+
+;; Return the number of members of the set TABLE.
+(define (set-size table)
+  (hash-table-size table))
+
+;; Return the members of the set TABLE, which must all be strings,
+;; as a list sorted with string<?.
+(define (set->list-sorted table)
+  (sort-list (set-keys table) string<?))
+
+;; Create a new, empty set.
+(define (make-set)
+  (make-hash-table))
+
+
+;; Return #t when LINE consists only of whitespace (or is empty).
+(define (blank-line? line)
+  (string-null? (string-trim-both line)))
+
+;; Split LINE at every ';' and return the list of columns. Signal an
+;; error quoting the line when it has fewer than MIN-COLUMNS columns,
+;; instead of failing later inside first/second/third.
+(define (split-row line min-columns)
+  (let ((columns (string-split line #\;)))
+    (when (< (length columns) min-columns)
+      (error (format #f "malformed line '~A': expected at least ~A ';'-separated columns"
+                     line min-columns)))
+    columns))
+
+;; Read data rows from PORT until end of file and return the sorted
+;; list of distinct IDs found in the first two columns (source and
+;; target). Blank lines are skipped.
+(define (get-ids port)
+  (let collect-ids ((ids (make-set)))
+    (let ((line (read-line port)))
+      (cond
+       ((eof-object? line)
+        (set->list-sorted ids))
+       ((blank-line? line)
+        (collect-ids ids))
+       (else
+        (let ((columns (split-row line 2)))
+          (collect-ids (set-add ids (first columns) (second columns)))))))))
+
+
+;; Return a procedure that maps each ID in the list IDS to its
+;; position in that list, counting from 0. Looking up an ID that is
+;; not in IDS signals an error (hash-table-ref without a fallback).
+;; NOTE(review): callers pass IDS sorted, so an index equals the
+;; lexicographic rank of its key; confirm that this is acceptable for
+;; the anonymization goal.
+(define (index-ids-fun ids)
+  (let ((id-to-index (make-hash-table)))
+    (let fill-table ((ids ids)
+                     (index 0))
+      (unless (null? ids)
+        (hash-table-set! id-to-index (car ids) index)
+        (fill-table (cdr ids) (+ 1 index))))
+    (lambda (id) (hash-table-ref id-to-index id))))
+
+;; Consume the first line of PORT and signal an error unless it
+;; starts with "source;target" (compared case-insensitively). An
+;; empty input is reported explicitly rather than as a type error.
+(define (check-csv-header port)
+  (let ((header (read-line port))
+        (required-header-lowercase "source;target"))
+    (cond
+     ((eof-object? header)
+      (error (format #f "input file is empty but must have header '~A' (regardless of case)" required-header-lowercase)))
+     ((not (string-prefix? required-header-lowercase
+                           (string-locale-downcase header)))
+      (error (format #f "input file must have header '~A' (regardless of case) but has header '~A'" required-header-lowercase header))))))
+
+;; Validate the header on PORT, then read all rows and return the
+;; ID -> index procedure for the IDs found.
+(define (index-ids-from-file port)
+  (check-csv-header port)
+  (index-ids-fun (get-ids port)))
+
+
+;; Copy the CSV on INPORT to OUTPORT, replacing the source and target
+;; columns by (ID->INDEX id) and keeping the third column (weight)
+;; unchanged. Columns after the third are dropped and blank lines are
+;; skipped. Returns #t at end of input.
+(define (anonymize-ids id->index inport outport)
+  (check-csv-header inport)
+  (format outport "Source;Target;Weight\n")
+  (let anonymize ((line (read-line inport)))
+    (cond
+     ((eof-object? line) #t)
+     ((blank-line? line)
+      (anonymize (read-line inport)))
+     (else
+      (let* ((columns (split-row line 3))
+             (source (id->index (first columns)))
+             (target (id->index (second columns)))
+             (weight (third columns)))
+        (format outport "~A;~A;~A\n" source target weight))
+      (anonymize (read-line inport))))))
+
+
+;; Entry point. ARGS is the command line: program name, optional
+;; input file (default "trust.csv") and optional output file
+;; (default "trust-anonymized.csv").
+(define (main args)
+  (let* ((infile (if (null? (cdr args))
+                     "trust.csv"
+                     (second args)))
+         (outfile (if (or (null? (cdr args)) (null? (cdr (cdr args))))
+                      "trust-anonymized.csv"
+                      (third args)))
+         ;; First pass: validate the header and build the ID -> index
+         ;; mapping. let* guarantees this finishes before the output
+         ;; file is opened, so a missing input file or a bad header is
+         ;; reported without touching the output file.
+         (id->index (call-with-input-file infile index-ids-from-file)))
+    ;; Second pass: call-with-*-file closes both ports once the copy
+    ;; has completed.
+    (call-with-input-file infile
+      (lambda (inport)
+        (call-with-output-file outfile
+          (lambda (outport)
+            (anonymize-ids id->index inport outport)))))))