(drak)
2016-02-02: Crawl all versions and save with the date from the weekly date hint.
diff --git a/crawl-wot.scm b/crawl-wot.scm --- a/crawl-wot.scm +++ b/crawl-wot.scm @@ -31,8 +31,8 @@ (string-append base-url "/" uri "?forcedownload=true")) -(define (get uri) - (let* ((u (string->uri uri)) +(define (get url) + (let* ((u (string->uri url)) (r (build-request u)) (p (open-socket-for-uri u)) (rr (write-request r p)) @@ -45,15 +45,18 @@ (let ((c (response-code resp)) (h (response-headers resp)) (b (read-response-body resp))) - (if (= c 301) - (get (furl (assoc-ref h 'location))) - (cond - ((equal? '(text/html (charset . "utf-8")) (assoc-ref h 'content-type)) - (utf8->string b)) - ((equal? '(application/force-download) (assoc-ref h 'content-type)) - (utf8->string b)) - (else - (assoc-ref h 'content-type)))))))) + (cond + ((= c 301) + (get (furl (assoc-ref h 'location)))) + ((= c 200) + (cond + ((equal? '(text/html (charset . "utf-8")) (assoc-ref h 'content-type)) + (utf8->string b)) + ((equal? '(application/force-download) (assoc-ref h 'content-type)) + (utf8->string b)) + (else (assoc-ref h 'content-type)))) + (else c)))))) + (define (non-breaking-sxml-reader xml-port) @@ -117,12 +120,72 @@ known (append known (map crawl new)))))))) +(define (parse-datehint str) + (let ((lines (string-split str #\newline))) + `((version . ,(list-ref lines 1)) + (date . ,(list-ref lines 2))))) + +(define* (datehint-for-key key year #:key (sitename "WebOfTrust") (week #f)) + (string-append "SSK" (substring key 3) + "/" sitename + "-" "DATEHINT" + "-" (number->string year) + (if week (string-append "-WEEK-" (number->string week)) ""))) + + +(define (furl-key-name-version key name version) + "Get a freenet URL for the key and the version" + (furl-uri (string-append key "/" name "-" (number->string version)))) + +(define (download-by-date-hint uri) + "Download all versions of the ID, ordered by the week in the DATEHINT." 
+ ;; An uri looks like this: USK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE + ;; A date hint for WoT looks like this: SSK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE-WebOfTrust-DATEHINT-2015 + ;; or + ;; SSK@[key]/[sitename]-DATEHINT-[year] + ;; SSK@[key]/[sitename]-DATEHINT-[year]-WEEK-[week] + ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month] + ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]-[day] + ;; see http://draketo.de/light/english/freenet/usk-and-date-hints + ;; Approach: First check whether the ID has a date hint for each year. Then check each week in the matching years. + ;; download the versions into directories ordered as YEAR-month-day/SSK@...-WebOfTrust-version + (let ((years (iota 10 2016 -1)) + (weeks (iota 52 52 -1))) ; 52-1 + (delete #f + (map (lambda (year) + (let* ((yearuri (datehint-for-key (wot-uri-key uri) year)) + (hint (get (furl-uri yearuri)))) + (write yearuri)(newline) + (write hint)(newline) + (if (not (string? hint)) + #f + (map (lambda (week) + (let* ((weekuri (datehint-for-key (wot-uri-key uri) year #:week week)) + (hint (get (furl-uri weekuri)))) + (if (not (string? hint)) + #f + (let* ((hint-alist (parse-datehint hint)) + (version (assoc 'version hint-alist)) + (date (assoc 'date hint-alist)) + (url (furl-key-name-version (wot-uri-key uri) "WebOfTrust" version)) + (filename (string-append date "/" uri "-" (number->string version)))) + (when (not (file-exists? date)) + (mkdir date)) + (let ((port (open-output-file filename))) + (put-string port (get url)) + (close-port port)) + filename)))) + weeks)))) + years)))) + (define (main args) (write args)(newline) (let ((seed-id (if (null? 
(cdr args)) seed-id (car (cdr args))))) - (dump-wot-id seed-id - (wot-uri-filename seed-id)) - (crawl-wot seed-id) + (write (download-by-date-hint seed-id)) (newline))) + ; (dump-wot-id seed-id + ; (wot-uri-filename seed-id)) + ; (let ((known-ids (crawl-wot seed-id))) + ; (newline))))