#!/usr/bin/env python3
# Parses a WARC file, finds gemtext, resolves links.
# ./findlinks.py file.warc
#
# Recovered from commit af6d63dbe86bf24b42442cbae5de5368fcc8fd35
# ("findlinks", dzwdz, 2024-02-22).

import os
import sys
import urllib.parse

# Content-Type that marks a WARC record as a captured gemini response.
GEMINI_RESPONSE_TYPE = "application/gemini; msgtype=response"


def bail(msg):
    """Print *msg* to stderr and abort the program.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print(msg, file=sys.stderr)
    # sys.exit() instead of the site-provided exit(): the latter is intended
    # for the interactive shell, and without an argument it reports success.
    sys.exit(1)


# directly stolen from gemini-demo
def absolutise_url(base, relative):
    """Resolve the link target *relative* against the page URL *base*.

    Args:
        base: URL of the page the link was found on.
        relative: link target as written in the page.

    Returns:
        *relative* unchanged when it contains "://" (treated as already
        absolute), otherwise the result of joining it onto *base*.
    """
    if "://" in relative:
        return relative
    if not base.startswith("gemini://"):
        return urllib.parse.urljoin(base, relative)

    # urljoin() does not resolve relative references against schemes it does
    # not know, so the join is done with http:// standing in for gemini://.
    # Only the leading scheme is swapped in each direction; an "http://" that
    # appears inside a query or fragment must survive untouched.
    joined = urllib.parse.urljoin("http://" + base[len("gemini://"):], relative)
    if joined.startswith("http://"):
        joined = "gemini://" + joined[len("http://"):]
    return joined


def checkout(headers, fp):
    """Print the absolute target of every link line in one gemtext record.

    The record is skipped silently unless it is a gemini response whose
    status starts with "2", whose meta starts with "text/gemini", and which
    carries a WARC-Target-URI header.

    Args:
        headers: dict of the record's WARC header fields (str -> str). Must
            contain "Content-Length" when Content-Type is a gemini response.
        fp: binary file object positioned at the start of the record's
            content block. Its position on return is unspecified.
    """
    if headers.get("Content-Type") != GEMINI_RESPONSE_TYPE:
        return

    length = int(headers["Content-Length"])
    # Bound the status-line read by the record size. An unbounded readline()
    # could run past the record, making the body size below negative, and
    # read() with a negative size consumes the rest of the file.
    header = fp.readline(length)
    if not header.startswith(b"2"):
        return
    # Skip "NN " (two status digits and a separator) to reach the meta field.
    if not header[3:].startswith(b"text/gemini"):
        return

    uri = headers.get("WARC-Target-URI")
    if uri is None:
        return

    body = fp.read(length - len(header))
    for line in body.split(b"\n"):
        if not line.startswith(b"=>"):
            continue
        # technically speaking this is invalid as it splits on more than
        # spaces and tabs
        fields = line[2:].split()
        if not fields:
            # A bare "=>" has no target; nothing to report.
            continue
        # errors="replace" keeps one badly encoded link from aborting the
        # scan of the whole archive.
        target = fields[0].decode("utf-8", errors="replace")
        print(absolutise_url(uri, target))


def read_headers(fp):
    """Read WARC named fields from *fp* up to the terminating blank line.

    Returns:
        dict mapping field name to field value (surrounding whitespace
        stripped). A repeated field keeps its last value.

    Raises:
        SystemExit: via bail() on EOF or on a line without a colon.
    """
    headers = {}
    while (line := fp.readline()) != b"\r\n":
        if not line:
            bail("truncated WARC header")
        name, sep, value = line.decode("utf-8").rstrip("\r\n").partition(":")
        if not sep:
            bail("malformed WARC header")
        headers[name] = value.strip()
    return headers


def scan_warc(fp):
    """Walk every record of the WARC stream *fp* and print gemtext links.

    Args:
        fp: seekable binary file object positioned at the first record.

    Raises:
        SystemExit: via bail() when the stream is not well-formed.
    """
    while line := fp.readline():
        if not line.startswith(b"WARC/"):
            bail("no WARC/")

        headers = read_headers(fp)
        if "Content-Length" not in headers:
            bail("no Content-Length")
        try:
            length = int(headers["Content-Length"])
        except ValueError:
            length = -1
        if length < 0:
            bail("bad Content-Length")

        start = fp.tell()
        checkout(headers, fp)

        # checkout() may stop anywhere inside the block, so jump to its end
        # explicitly and verify the two CRLFs that separate records.
        fp.seek(start + length)
        if fp.read(4) != b"\r\n\r\n":
            bail("misaligned")


def main(argv):
    """Entry point: scan the WARC file named by argv[1]."""
    if len(argv) < 2:
        bail("usage: ./findlinks.py file.warc")
    with open(argv[1], "rb") as fp:
        scan_warc(fp)


if __name__ == "__main__":
    main(sys.argv)