diff options
-rwxr-xr-x | findlinks.py | 57 | ||||
-rwxr-xr-x[-rw-r--r--] | garcon.py | 2 |
2 files changed, 59 insertions, 0 deletions
#!/usr/bin/env python3
# Parses a WARC file, finds gemtext, resolves links.
# ./findlinks.py file.warc

import os
import re
import sys
import urllib.parse

GEMINI_PREFIX = "gemini://"
HTTP_PREFIX = "http://"
GEMINI_RESPONSE_TYPE = "application/gemini; msgtype=response"

# RFC 3986 section 3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.-]*:")


def bail(msg):
    """Print *msg* to stderr and terminate with a failing exit status."""
    print(msg, file=sys.stderr)
    # sys.exit(1) rather than the interactive helper exit(): the latter is
    # only installed by the site module and exits with status 0.
    sys.exit(1)


# directly stolen from gemini-demo
def absolutise_url(base, relative):
    """Resolve the link *relative* against the page URL *base*.

    A link that already starts with a URL scheme is returned unchanged.
    Anything else is joined onto *base* with urllib.parse.urljoin.
    """
    if _SCHEME_RE.match(relative):
        return relative

    # Python's URL tools only resolve relative references for schemes they
    # know about, so a gemini base is joined under an http:// disguise.
    disguised = base.startswith(GEMINI_PREFIX)
    if disguised:
        base = HTTP_PREFIX + base[len(GEMINI_PREFIX):]

    joined = urllib.parse.urljoin(base, relative)

    # Undo the disguise on the scheme prefix only, so that an "http://"
    # appearing inside the path or query of the link is left untouched.
    if disguised and joined.startswith(HTTP_PREFIX):
        joined = GEMINI_PREFIX + joined[len(HTTP_PREFIX):]
    return joined


def _link_targets(body):
    """Yield the raw target (bytes) of every "=>" link line in *body*."""
    for line in body.split(b"\n"):
        if not line.startswith(b"=>"):
            continue
        # technically speaking this is invalid as it splits on more than
        # spaces and tabs
        fields = line[2:].split(None, 1)
        if fields:  # a bare "=>" carries no URL at all
            yield fields[0]


def checkout(headers, fp):
    """Print the absolute form of every link in one WARC record.

    *headers* is the dict of WARC header fields for the record and *fp* is
    the binary file positioned at the start of the record's content block.
    Records that are not successful (status 2x) text/gemini responses, or
    that carry no WARC-Target-URI, are ignored.  The caller is responsible
    for repositioning *fp* afterwards.
    """
    if headers.get("Content-Type") != GEMINI_RESPONSE_TYPE:
        return

    header = fp.readline()
    if not header.startswith(b"2"):
        return
    # The response header is "<2-digit status> <meta>"; meta starts at [3].
    if not header[3:].startswith(b"text/gemini"):
        return

    uri = headers.get("WARC-Target-URI")
    if uri is None:
        return

    remaining = int(headers["Content-Length"]) - len(header)
    if remaining <= 0:
        # Nothing after the response header.  A negative size handed to
        # read() would mean "read to end of file", so stop here.
        return

    for raw_target in _link_targets(fp.read(remaining)):
        try:
            target = raw_target.decode("utf-8")
        except UnicodeDecodeError:
            print(f"skipping undecodable link in {uri}", file=sys.stderr)
            continue
        print(absolutise_url(uri, target))


def _read_headers(fp):
    """Read "Name: value" lines from *fp* up to the blank line; return a dict."""
    headers = {}
    while (line := fp.readline()) != b"\r\n":
        if not line:
            bail("truncated header")
        # Split on the first ": " only; the value may itself contain ": ".
        key, sep, value = line.decode("utf-8").rstrip("\r\n").partition(": ")
        if not sep:
            bail("malformed header")
        headers[key] = value
    return headers


def main(argv=None):
    """Walk every record of the WARC file named in argv[1], printing links."""
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        bail("usage: findlinks.py file.warc")

    with open(argv[1], "rb") as fp:
        while line := fp.readline():
            if not line.startswith(b"WARC/"):
                bail("no WARC/")

            headers = _read_headers(fp)
            if "Content-Length" not in headers:
                bail("no Content-Length")
            try:
                length = int(headers["Content-Length"])
            except ValueError:
                bail("bad Content-Length")

            pos = fp.tell()
            checkout(headers, fp)

            # Skip to the end of the content block regardless of how much
            # checkout() consumed; each record ends with two CRLFs.
            fp.seek(pos + length)
            if fp.read(4) != b"\r\n\r\n":
                bail("misaligned")


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Provenance: this script was recovered from a flattened diff view
# (a/findlinks.py -> b/findlinks.py, new file mode 100755, index
# 0000000..de592c9, @@ -0,0 +1,57 @@).  The same diff also touched garcon.py
# (index e10f85e..6ada364, mode 100644 -> 100755, @@ -1,3 +1,5 @@), adding a
# "#!/usr/bin/env python3" line and a blank line ahead of its existing
# "from sys import stdin, stdout, stderr", "import asyncio" and
# "import datetime" lines.