#!/bin/sh
# -*- scheme -*-
exec guile -e main -s "$0" "$@"
!#

;; Remove duplicate entries from the csv file (these are due to
;; downloading multiple versions of the same ID).

(use-modules (ice-9 rdelim) ; for read-line
             (ice-9 i18n)
             (srfi srfi-1) ; first, second, third
             )

(define (deduplicate infile outfile)
  (let ((known (make-hash-table))
        (inport (open-input-file infile))
        (outport (open-output-file outfile)))
    ;; first copy the header
    (display (read-line inport) outport)
    (newline outport)
    (let copy-dedup ((line (read-line inport)))
      (cond
       ((eof-object? line)
        #t)
       (else
        (let* ((columns (string-split line #\;))
               (source (first columns))
               (target (second columns))
               (key (string-append source target)))
          (when (not (hash-ref known key))
            (hash-set! known key #t)
            (display line outport)
            (newline outport))
          (copy-dedup (read-line inport))))))))
            

(define (main args)
  (let ((infile (if (null? (cdr args))
                    "trust.csv"
                    (second args)))
        (outfile (if (or (null? (cdr args)) (null? (cdr (cdr args))))
                     "trust-deduplicated.csv"
                     (third args))))
    (deduplicate infile outfile)))