diff options
| -rwxr-xr-x | findlinks.py | 57 | ||||
| -rwxr-xr-x[-rw-r--r--] | garcon.py | 2 | 
2 files changed, 59 insertions, 0 deletions
# diff --git a/findlinks.py b/findlinks.py new file mode 100755 index 0000000..de592c9 --- /dev/null +++ b/findlinks.py @@ -0,0 +1,57 @@
#!/usr/bin/env python3
# Parses a WARC file, finds gemtext, resolves links.
# ./findlinks.py file.warc

import os
import sys
import urllib.parse

GEMINI_PREFIX = "gemini://"
HTTP_PREFIX = "http://"
GEMINI_RESPONSE_TYPE = "application/gemini; msgtype=response"


def bail(msg):
    """Print *msg* to stderr and terminate with a non-zero exit status."""
    print(msg, file=sys.stderr)
    # sys.exit (not the site-provided exit()) and status 1, so callers and
    # shell pipelines can tell that parsing failed.
    sys.exit(1)


# directly stolen from gemini-demo
def absolutise_url(base, relative):
    """Resolve *relative* against *base* and return the absolute URL.

    Anything containing "://" is treated as already absolute and returned
    unchanged. Otherwise the link is joined onto *base*.
    """
    if "://" in relative:
        return relative

    # urljoin() only resolves relative references for schemes it knows, and
    # gemini is not one of them, so temporarily present the base as http.
    # Only the leading scheme is swapped; the rest of the URL is untouched.
    swapped = base.startswith(GEMINI_PREFIX)
    if swapped:
        base = HTTP_PREFIX + base[len(GEMINI_PREFIX):]

    joined = urllib.parse.urljoin(base, relative)

    if swapped and joined.startswith(HTTP_PREFIX):
        joined = GEMINI_PREFIX + joined[len(HTTP_PREFIX):]
    return joined


def checkout(headers, fp):
    """Print the absolute target of every link in one gemtext WARC record.

    *headers* is the dict of WARC header fields for the record and *fp* is a
    binary file object positioned at the start of the record payload.
    Records that are not successful (2x) text/gemini Gemini responses, or
    that have no WARC-Target-URI, are ignored. The file position is left
    wherever reading stopped; the caller is expected to re-seek.
    """
    if headers.get("Content-Type") != GEMINI_RESPONSE_TYPE:
        return

    # First payload line is the Gemini response header: "<status> <meta>\r\n".
    header = fp.readline()
    if not header or header[0] != ord('2'):
        return
    mime = header[3:]
    if not mime.startswith(b'text/gemini'):
        return

    uri = headers.get("WARC-Target-URI")
    if uri is None:
        return

    # Content-Length covers the response header too; never pass a negative
    # size to read() if the record is malformed.
    remaining = max(int(headers["Content-Length"]) - len(header), 0)
    body = fp.read(remaining)

    for line in body.split(b'\n'):
        if not line.startswith(b'=>'):
            continue
        # Gemtext separates the URL from its label with spaces/tabs only.
        # The trailing "\r" of CRLF line endings is stripped as well.
        rest = line[2:].strip(b' \t\r')
        if not rest:
            # "=>" with no URL: nothing to resolve.
            continue
        target = rest.replace(b'\t', b' ').split(b' ', 1)[0]
        print(absolutise_url(uri, target.decode('utf-8', errors='replace')))


def main(argv=None):
    """Walk every record of the WARC file named in argv[1] and print links.

    Exits via bail() when the file does not look like a well-formed,
    uncompressed WARC stream.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        bail("usage: findlinks.py file.warc")

    with open(args[1], "rb") as fp:
        while line := fp.readline():
            if not line.startswith(b'WARC/'):
                bail("no WARC/")

            headers = {}
            while (line := fp.readline()) != b'\r\n':
                if not line:
                    bail("truncated headers")
                # Split on the first ": " only, so values that themselves
                # contain ": " are kept intact.
                key, sep, value = (
                    line.decode('utf-8').rstrip('\r\n').partition(': ')
                )
                if not sep:
                    bail("malformed header")
                headers[key] = value

            if "Content-Length" not in headers:
                bail("no Content-Length")
            try:
                length = int(headers["Content-Length"])
            except ValueError:
                bail("bad Content-Length")
            pos = fp.tell()

            checkout(headers, fp)

            # Skip to the end of the payload regardless of how much
            # checkout() consumed; every record ends with two CRLFs.
            fp.seek(pos + length)
            if fp.read(4) != b'\r\n\r\n':
                bail("misaligned")


if __name__ == "__main__":
    main()
# diff --git a/garcon.py b/garcon.py index e10f85e..6ada364 100644..100755 --- a/garcon.py +++ b/garcon.py
@@ -1,3 +1,5 @@ +#!/usr/bin/env python3 +  from sys import stdin, stdout, stderr  import asyncio  import datetime  | 
