(drak)
2016-01-27: first try at a wot crawler. breaks at parsing one xml file. first try at a wot crawler. breaks at parsing one xml file.
diff --git a/crawl-wot.scm b/crawl-wot.scm
new file mode 100755
--- /dev/null
+++ b/crawl-wot.scm
@@ -0,0 +1,100 @@
+#!/bin/sh
+# -*- scheme -*-
+exec guile -e main -s "$0" "$@"
+!#
+
+;; Simple WoT crawler
+
+(use-modules (web request)
+ (web client)
+ (web response)
+ (web uri)
+ (web http)
+ (ice-9 threads)
+ (ice-9 vlist)
+ (ice-9 rdelim)
+ (rnrs io ports)
+ (ice-9 match)
+ (srfi srfi-42)
+ (rnrs bytevectors)
+ (sxml simple)
+ (sxml match))
+
+(define base-url "http://127.0.0.1:8888")
+(define seed-id "USK@QeTBVWTwBldfI-lrF~xf0nqFVDdQoSUghT~PvhyJ1NE,OjEywGD063La2H-IihD7iYtZm3rC0BP6UTvvwyF5Zh4,AQACAAE/WebOfTrust/1502")
+
+(define (furl uri)
+ (string-append base-url uri "?forcedownload=true"))
+
+(define (furl-uri uri)
+ (string-append base-url "/" uri "?forcedownload=true"))
+
+
+(define (get uri)
+ (let* ((u (string->uri uri))
+ (r (build-request u))
+ (p (open-socket-for-uri u))
+ (rr (write-request r p))
+ (rp (request-port rr)))
+ (force-output p)
+ (declare-opaque-header! "Location")
+ ;(while (write (read-line p))
+ ; (newline))
+ (let ((resp (read-response rp)))
+ (let ((c (response-code resp))
+ (h (response-headers resp))
+ (b (read-response-body resp)))
+ (if (= c 301)
+ (get (furl (assoc-ref h 'location)))
+ (cond
+ ((equal? '(text/html (charset . "utf-8")) (assoc-ref h 'content-type))
+ (utf8->string b))
+ ((equal? '(application/force-download) (assoc-ref h 'content-type))
+ (utf8->string b))
+ (else
+ (assoc-ref h 'content-type))))))))
+
+
+(define (snarf-wot-ids xml-port)
+ (let ((uris '()))
+ (let grab-uris ((sxml (xml->sxml xml-port)))
+ (match sxml
+ (('Identity uri) (set! uris (cons uri uris)))
+ ((a b ...)
+ (map grab-uris sxml))
+ (else sxml)))
+ uris))
+
+(define (wot-uri-key uri)
+ (string-take uri (string-index uri #\/)))
+
+(define (wot-uri-filename uri)
+ (string-join (string-split uri #\/) "-"))
+
+(define (dump-wot-id uri filename)
+ (if (string-prefix? "USK@" uri)
+ (let ((port (open-output-file filename)))
+ (put-string port (get (furl-uri uri))))
+ (error (format #t "tried to save in file ~A" uri))))
+
+(define (crawl-wot seed-id)
+ (let ((known '()))
+ (let crawl ((seed seed-id))
+ ;; save the data
+ (dump-wot-id seed-id (wot-uri-filename seed))
+ ;; snarf all uris
+ (let* ((uris (call-with-input-file (wot-uri-filename seed) snarf-wot-ids))
+ (new (list-ec (: u uris) (if (not (member (wot-uri-key u) known))) u)))
+ (when (not (null? new))
+ (display 'new:)
+ (write (car new))(newline))
+ (when (not (null? known))
+ (display 'known:)
+ (write (car known))(newline))
+ (set! known (cons new known))
+ (map crawl new)))))
+
+(define (main args)
+ (dump-wot-id seed-id (wot-uri-filename seed-id))
+ (crawl-wot seed-id)
+ (newline))