(Arne Babenhauserheide)
2016-02-20: merge merge
diff --git a/crawl-wot.scm b/crawl-wot.scm --- a/crawl-wot.scm +++ b/crawl-wot.scm @@ -134,17 +134,17 @@ (not (pair? u)) ; TODO: this is a hack. I do not know why u can be the full sxml. Seems to happen with IDs who do not have any trust set. (not (member (wot-uri-key u) known)))) u))) (when (not (null? new)) - (display 'new:) + (display "new: ") (write (car new))(newline)) (when (not (null? known)) - (display 'known:) + (display "known: ") (write (car known))(newline)(write (length known))(newline)) (set! known (lset-union equal? (list-ec (: u new) (wot-uri-key u)) known)) (if (null? new) known - (append known (map crawl new))))))))) + (lset-union equal? known (map crawl new))))))))) (define (parse-datehint str) (let ((lines (string-split str #\newline))) @@ -175,6 +175,7 @@ (filename (string-append date "/" (wot-uri-key uri) "-" version))) (when (not (file-exists? date)) (mkdir date)) + (format #t "download to: ~A | for week ~A\n" filename week) (let ((data (get url))) (when (string? data) (let ((port (open-output-file filename))) @@ -194,18 +195,24 @@ ;; see http://draketo.de/light/english/freenet/usk-and-date-hints ;; Approach: First check whether the ID has a date hint for each year. Then check each weak in the matching years. ;; download the versions into directories ordered as YEAR-month-day/SSK@...-WebOfTrust-version - (let ((years (iota 10 2016 -1)) - (weeks (iota 52 1))) ; 52-1 + (let ((years (iota 10 2016 -1)) ; last 10 years + (weeks (iota 52 1))) ; 1-52 (delete #f ;; only return the filenames of successful downloads (par-map (lambda (year) (let* ((yearuri (datehint-for-key (wot-uri-key uri) year)) (hint (get (furl-uri yearuri)))) (if (not (string? hint)) #f - (delete #f ;; only return the filenames of successful downloads - (n-par-map 52 (lambda (week) - (download-by-weekly-date-hint uri year week)) - weeks))))) + (let* ((hint-alist (parse-datehint hint)) + (date (assoc-ref hint-alist 'date)) + (month (string->number (list-ref (string-split date #\-) 2))) + (min-week (* (- month 1) 4))) ; avoid trying to download weeks which cannot be available. + (delete #f ;; only return the filenames of successful downloads + (n-par-map 52 (lambda (week) + (if (< week min-week) ; avoid weeks earlier than the date in the yearly date hint + #f + (download-by-weekly-date-hint uri year week))) + weeks)))))) years)))) (define (main args) @@ -215,6 +222,6 @@ (let ((seed (if (string-index seed-id #\/) seed-id (string-append "USK" (string-drop seed-id 3) "/WebOfTrust/0")))) - (write (download-by-date-hint seed)) - (par-map (lambda (x) (map download-by-date-hint x)) + ;; (write (download-by-date-hint seed)) + (par-map download-by-date-hint (crawl-wot seed)))))