#!/bin/sh
# -*- scheme -*-
exec guile -e main -s "$0" "$@"
!#

;; Simple WoT crawler

(use-modules (web request)
             (web client)
             (web response)
             (web uri)
             (web http)
             (ice-9 threads)
             (ice-9 vlist)
             (ice-9 rdelim)
             (rnrs io ports)
             (ice-9 match)
             (srfi srfi-42)
             (srfi srfi-1)
             (rnrs bytevectors)
             (sxml simple)
             (sxml match))

(define base-url "http://127.0.0.1:8888")
(define seed-id "USK@QeTBVWTwBldfI-lrF~xf0nqFVDdQoSUghT~PvhyJ1NE,OjEywGD063La2H-IihD7iYtZm3rC0BP6UTvvwyF5Zh4,AQACAAE/WebOfTrust/1502")

(define (furl uri)
  (string-append base-url uri "?forcedownload=true"))

(define (furl-uri uri)
  (string-append base-url "/" uri "?forcedownload=true"))


(define (get url)
  (let* ((u (string->uri url))
         (r (build-request u))
         (p (open-socket-for-uri u))
         (rr (write-request r p))
         (rp (request-port rr)))
    (force-output p)
    (declare-opaque-header! "Location")
      ;(while (write (read-line p))
      ;       (newline))
    (let ((resp (read-response rp)))
      (let ((c (response-code resp))
            (h (response-headers resp))
            (b (read-response-body resp)))
        (cond
         ((= c 301)
          (get (furl (assoc-ref h 'location))))
          ((= c 200)
           (cond
            ((equal? '(text/html (charset . "utf-8")) (assoc-ref h 'content-type))
             (utf8->string b))
            ((equal? '(application/force-download) (assoc-ref h 'content-type))
             (utf8->string b))
            (else (assoc-ref h 'content-type))))
          (else c))))))



(define (non-breaking-sxml-reader xml-port)
  (catch #t
         (lambda () (xml->sxml xml-port))
         (lambda (key . args) (format #t "~A: ~A" key args)(newline) '())))

(define (snarf-wot-ids xml-port)
  (let ((sxml (non-breaking-sxml-reader xml-port)))
    (let ((uris '()))
      (let grab-uris ((sxml sxml))
        (match sxml
               (('Identity uri) (set! uris (cons uri uris)))
               ((a b ...)
                (map grab-uris sxml))
               (else '())))
      uris)))

(define (wot-uri-key uri)
  (let ((index (string-index uri #\/)))
    (if index 
        (string-take uri index)
        uri))) ;; no / in uri, so it is already a key.

(define (wot-uri-filename uri)
  (let ((u (if (string-prefix? "freenet:" uri)
               (substring uri 8)
               uri)))
    (string-join (string-split u #\/) "-")))

(define (dump-wot-id uri filename)
  (let ((u (if (string-prefix? "freenet:" uri)
               (substring uri 8)
               uri)))
    (format #t "Download to file ~A\n" filename)
    (if (string-prefix? "USK@" u)
        (let ((data (get (furl-uri u))))
          (if (string? data)
            (let ((port (open-output-file filename)))
              (put-string port data)
              (close-port port))
            (error (format #t "tried to save in file ~A\n" filename))))
        (error (format #t "tried to save in file ~A\n" filename)))))

(define (flatten l)
  "Flatten a nested list into a single list."
  (cond ((null? l) '())
        ((list? l) (append (flatten (car l)) (flatten (cdr l))))
        (else (list l))))

(define* (crawl-wot seed-id #:key (redownload #f))
  ;; TODO: add (flatten ...) with Guile 2.1.x (currently it gives a stack overflow)
  (let ((known '()))
    (let crawl ((seed seed-id))
      ;; save the data
      (if (catch 'misc-error
            (lambda () (let* ((filename (wot-uri-filename seed))
                              (dump (lambda () (dump-wot-id seed filename))))
                         (if (and (not redownload) (file-exists? filename))
                             (let* ((s (stat filename))
                                    (size (stat:size s)))
                               (if (= size 0)
                                   (dump)
                                   (format #t "Use local copy of file ~A (redownload ~A).\n" filename redownload)))
                             (dump))
                         #f))
            (lambda (key . args) #t))
          known
          ;; snarf all uris
          (let ((uris (call-with-input-file (wot-uri-filename seed) snarf-wot-ids)))
            ;; (write seed)(newline)
            ;; (when (not (null? uris))
            ;;  (write (car uris))(newline))
            (let ((new (list-ec (: u uris) (if (and
                                                (not (pair? u)) ; TODO: this is a hack. I do not know why u can be the full sxml. Seems to happen with IDs who do not have any trust set.
                                                (not (member (wot-uri-key u) known)))) u)))
              (when (not (null? new))
                (display "new: ")
                (write (car new))(newline))
              (when (not (null? known))
                (display "known: ")
                (write (car known))(newline)(write (length known))(newline))
              (set! known (lset-union equal?
                                      (list-ec (: u new) (wot-uri-key u))
                                      known))
              (if (null? new)
                  known
                  (lset-union equal? known (map crawl new)))))))))

(define (parse-datehint str)
  (let ((lines (string-split str #\newline)))
    `((version . ,(list-ref lines 1))
      (date . ,(list-ref lines 2)))))

(define* (datehint-for-key key year #:key (sitename "WebOfTrust") (week #f))
  (string-append "SSK" (substring key 3)
                 "/" sitename
                 "-" "DATEHINT"
                 "-" (number->string year)
                 (if week (string-append "-WEEK-" (number->string week)) "")))
  

(define (furl-key-name-version key name version)
  "Get a freenet URL for the key and the version"
  (furl-uri (string-append "SSK" (substring key 3) "/" name "-" version)))

(define (download-by-weekly-date-hint uri year week)
  (let* ((weekuri (datehint-for-key (wot-uri-key uri) year #:week week))
         (hint (get (furl-uri weekuri))))
    (if (not (string? hint))
        #f
        (let* ((hint-alist (parse-datehint hint))
               (version (assoc-ref hint-alist 'version))
               (date (assoc-ref hint-alist 'date))
               (url (furl-key-name-version (wot-uri-key uri) "WebOfTrust" version))
               (filename (string-append date "/" (wot-uri-key uri) "-" version)))
          (when (not (file-exists? date))
            (mkdir date))
          (format #t "download to: ~A | for week ~A\n" filename week)
          (let ((data (get url)))
            (when (string? data)
              (let ((port (open-output-file filename)))
                (put-string port data)
                (close-port port))))
          filename))))

(define (download-by-date-hint uri)
  "Download all versions of the ID, ordered by the week in the DATEHINT."
  ;; An uri looks like this: USK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE
  ;; A date hint for WoT looks like this: SSK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE-WebOfTrust-DATEHINT-2015
  ;; or
  ;; SSK@[key]/[sitename]-DATEHINT-[year]
  ;; SSK@[key]/[sitename]-DATEHINT-[year]-WEEK-[week]
  ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]
  ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]-[day]
  ;; see http://draketo.de/light/english/freenet/usk-and-date-hints
  ;; Approach: First check whether the ID has a date hint for each year. Then check each weak in the matching years.
  ;; download the versions into directories ordered as YEAR-month-day/SSK@...-WebOfTrust-version
  (let ((years (iota 10 2016 -1)) ; last 10 years
        (weeks (iota 52 1))) ; 1-52
    (delete #f ;; only return the filenames of successful downloads 
            (par-map (lambda (year)
                       (let* ((yearuri (datehint-for-key (wot-uri-key uri) year))
                              (hint (get (furl-uri yearuri))))
                         (if (not (string? hint))
                             #f
                             (let* ((hint-alist (parse-datehint hint))
                                    (date (assoc-ref hint-alist 'date))
                                    (month (string->number (list-ref (string-split date #\-) 2)))
                                    (min-week (* month 4))) ; avoid trying to download weeks which cannot be available.
                               (delete #f ;; only return the filenames of successful downloads 
                                       (n-par-map 52 (lambda (week)
                                                       (if (< week min-week) ; avoid weeks earlier than the date in the yearly date hint
                                                           #f
                                                           (download-by-weekly-date-hint uri year week)))
                                                  weeks))))))
                     years))))

(define (main args)
  (let ((seed-id (if (null? (cdr args))
                     seed-id
                     (car (cdr args)))))
    (let ((seed (if (string-index seed-id #\/)
                    seed-id
                    (string-append "USK" (string-drop seed-id 3) "/WebOfTrust/-1")))) ; -1 can also return 0
      ;; (crawl-wot seed))))
      ;; (write (download-by-date-hint seed)))))
      (par-map download-by-date-hint
               (crawl-wot seed)))))