Guile Freenet
 
(drak)
2016-02-12: add full deduplication

add full deduplication

diff --git a/deduplicate-csv.scm b/deduplicate-csv.scm
new file mode 100755
--- /dev/null
+++ b/deduplicate-csv.scm
@@ -0,0 +1,42 @@
+#!/bin/sh
+# -*- scheme -*-
+exec guile -e main -s "$0" "$@"
+!#
+
+;; Remove duplicate entries from the csv file (these are due to
+;; downloading multiple versions of the same ID).
+
+(use-modules (ice-9 rdelim) ; for read-line
+             (ice-9 i18n)
+             (srfi srfi-1) ; first, second, third
+             )
+
+(define (deduplicate infile outfile)
+  ;; Copy INFILE to OUTFILE (returns #t): the header line verbatim, then only
+  ;; the first row for each pair of values in the first two `;' columns.
+  (let ((known (make-hash-table))
+        (inport (open-input-file infile))
+        (outport (open-output-file outfile)))
+    (let copy-dedup ((line (read-line inport)) (header? #t))
+      (cond
+       ((eof-object? line)              ; reached at once for an empty file
+        (close-port inport)
+        (close-port outport)            ; flushes what was written
+        #t)
+       (else
+        (let* ((columns (string-split line #\;))
+               ;; list key: "ab";"c" is not "a";"bc"; short rows use the line
+               (key (if (null? (cdr columns)) line (take columns 2))))
+          (when (or header? (not (hash-ref known key)))
+            (unless header? (hash-set! known key #t))
+            (write-line line outport))  ; read-line stripped the newline
+          (copy-dedup (read-line inport) #f)))))))
+
+(define (main args)
+  ;; Entry point for `guile -e main': ARGS is (SCRIPT [INFILE [OUTFILE]]);
+  ;; file names that are not given fall back to the trust*.csv defaults.
+  (let* ((params (cdr args))
+         (have-in? (pair? params))
+         (have-out? (and have-in? (pair? (cdr params)))))
+    (deduplicate (if have-in? (first params) "trust.csv")
+                 (if have-out? (second params) "trust-deduplicated.csv"))))