#!/usr/bin/env python3
"""Parse a WARC file, find gemtext responses and print their resolved links.

Usage: ./findlinks.py file.warc

Every link line (``=> target [label]``) found in a successful
``text/gemini`` response record is printed to stdout, one absolute URL
per line.
"""

import os
import sys
import urllib.parse

_GEMINI_SCHEME = "gemini://"
_HTTP_SCHEME = "http://"
_GEMINI_RESPONSE_TYPE = "application/gemini; msgtype=response"
_RECORD_TRAILER = b"\r\n\r\n"


def bail(msg):
    """Print *msg* to stderr and terminate the script with exit status 1."""
    print(msg, file=sys.stderr)
    sys.exit(1)


# Adapted from gemini-demo.
def absolutise_url(base, relative):
    """Resolve the link target *relative* against the page URL *base*.

    Targets that already contain ``"://"`` are returned unchanged.  For a
    ``gemini://`` base the scheme is temporarily swapped to ``http://``
    because ``urllib.parse.urljoin`` only joins against schemes it knows
    about.  Only the leading scheme is swapped (and swapped back), so an
    ``http://`` that merely appears inside a query string is left alone.
    Any other base is handed to ``urljoin`` as is.
    """
    if "://" in relative:
        return relative
    if not base.startswith(_GEMINI_SCHEME):
        return urllib.parse.urljoin(base, relative)
    joined = urllib.parse.urljoin(
        _HTTP_SCHEME + base[len(_GEMINI_SCHEME):], relative
    )
    # Targets such as "mailto:..." keep their own scheme after the join.
    if joined.startswith(_HTTP_SCHEME):
        joined = _GEMINI_SCHEME + joined[len(_HTTP_SCHEME):]
    return joined


def checkout(headers, fp):
    """Print the absolute URL of every link in one WARC record's payload.

    *headers* is the record's WARC header mapping and *fp* a binary stream
    positioned at the first payload byte.  Nothing is printed unless the
    record is a Gemini response with a ``2x`` status, a ``text/gemini``
    meta line and a ``WARC-Target-URI`` header.  The stream position on
    return is unspecified; the caller is expected to seek to the next
    record itself.

    Raises:
        KeyError: a Gemini response record has no ``Content-Length``.
        ValueError: ``Content-Length`` is not an integer.
    """
    if headers.get("Content-Type") != _GEMINI_RESPONSE_TYPE:
        return
    length = int(headers["Content-Length"])
    if length <= 0:
        return
    # Bound the read by the record length: an unbounded readline() on a
    # truncated record would run into the following record, and the
    # resulting negative size passed to read() would slurp the whole file.
    header = fp.readline(length)
    if not header or header[0] != ord('2'):
        return
    mime = header[3:]  # skip the two status digits and the separator
    if not mime.startswith(b'text/gemini'):
        return
    uri = headers.get("WARC-Target-URI")
    if uri is None:
        return
    body = fp.read(length - len(header))
    for line in body.split(b'\n'):
        if not line.startswith(b'=>'):
            continue
        # technically speaking this is invalid as it splits on more than
        # spaces and tabs
        fields = line[2:].split()
        if not fields:
            continue  # a bare "=>" carries no target
        print(absolutise_url(uri, fields[0].decode('utf-8')))


def _read_headers(fp):
    """Read ``Name: value`` lines up to the blank line and return a dict."""
    headers = {}
    while (line := fp.readline()) != b'\r\n':
        if not line:
            bail("truncated headers")
        # partition() splits on the first ": " only, so values that contain
        # ": " themselves are kept intact.
        name, sep, value = line.decode('utf-8').rstrip('\r\n').partition(': ')
        if not sep:
            bail("malformed header")
        headers[name] = value
    return headers


def main(argv=None):
    """Scan the WARC file named by ``argv[1]`` and print every gemtext link.

    *argv* defaults to ``sys.argv``.  Structural problems in the file are
    reported through :func:`bail`.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        bail("usage: findlinks.py file.warc")
    with open(args[1], "rb") as fp:
        while line := fp.readline():
            if not line.startswith(b'WARC/'):
                bail("no WARC/")
            headers = _read_headers(fp)
            if "Content-Length" not in headers:
                bail("no Content-Length")
            try:
                length = int(headers["Content-Length"])
            except ValueError:
                bail("bad Content-Length")
            if length < 0:
                bail("bad Content-Length")
            pos = fp.tell()
            checkout(headers, fp)
            # checkout() may stop anywhere inside the payload; jump to the
            # end of the record regardless of how much it consumed.
            fp.seek(pos + length)
            if fp.read(len(_RECORD_TRAILER)) != _RECORD_TRAILER:
                bail("misaligned")


if __name__ == "__main__":
    main()