#!/usr/bin/env python3
# Parses a WARC file, finds gemtext, resolves links.
# ./findlinks.py file.warc

import os
import sys
import urllib.parse

def bail(msg):
	"""Report a fatal error and stop the program.

	msg: text written to stderr (followed by a newline).

	Raises SystemExit with exit status 1, so a shell or calling script can
	tell a failed run from a successful one.
	"""
	print(msg, file=sys.stderr)
	# sys.exit() rather than the bare exit() helper: exit() is injected by the
	# site module (absent under `python -S`) and, called without an argument,
	# terminates with status 0 -- i.e. it reports success.
	sys.exit(1)

# directly stolen from gemini-demo
def absolutise_url(base, relative):
    """Resolve a link target against the URL of the page it appeared on.

    base: absolute URL of the containing document (normally gemini://...).
    relative: link target as written in the document.

    Returns `relative` unchanged when it already carries a scheme
    (https://..., mailto:..., ...) or cannot be parsed as a URL at all;
    otherwise returns the absolute URL obtained by joining it onto `base`.
    """
    try:
        has_scheme = bool(urllib.parse.urlsplit(relative).scheme)
    except ValueError:
        # Unparseable (e.g. a malformed bracketed host): pass it through as-is.
        return relative
    # Checking for a real scheme instead of the substring "://" keeps links
    # such as "search?u=http://example.com/" from being mistaken for absolute.
    if has_scheme:
        return relative

    gemini = "gemini://"
    if not base.startswith(gemini):
        # A scheme urljoin understands (or one we have no workaround for).
        return urllib.parse.urljoin(base, relative)

    # urljoin() only resolves relative references for schemes it knows about,
    # and gemini is not one of them, so borrow "http" for the join. Only the
    # leading scheme is swapped, so any "http://" or "gemini://" text further
    # inside the URL (a query string, say) is left untouched.
    http = "http://"
    joined = urllib.parse.urljoin(http + base[len(gemini):], relative)
    return gemini + joined[len(http):]

def checkout(headers, fp):
	"""Print the absolute target of every link in one gemtext response record.

	headers: dict mapping the record's WARC header field names to values.
	fp: binary file object positioned at the first byte of the record payload.

	The record is ignored unless it is a Gemini response
	("application/gemini; msgtype=response") whose response line starts with
	'2', whose meta starts with text/gemini, and which has both a
	Content-Length and a WARC-Target-URI header.

	Returns None; each link is written to stdout on its own line. The file
	position is left wherever reading stopped, so the caller has to seek to
	the end of the record itself.

	Raises ValueError if Content-Length is not an integer.
	"""
	if headers.get("Content-Type") != "application/gemini; msgtype=response": return
	if "Content-Length" not in headers: return
	length = int(headers["Content-Length"])
	if length <= 0: return

	# Never read past the payload: a response line without a newline must not
	# swallow the following records.
	header = fp.readline(length)
	if not header or header[0] != ord('2'): return
	# Response line is "<two-digit status><space><meta>"; meta starts at byte 3.
	mime = header[3:]
	if not mime.startswith(b'text/gemini'): return

	if "WARC-Target-URI" not in headers: return
	uri = headers["WARC-Target-URI"]

	# readline(length) returned at most `length` bytes, so this is never negative
	# (a negative size would make read() consume the rest of the file).
	body = fp.read(length - len(header))
	preformatted = False
	for line in body.split(b'\n'):
		# A line starting with ``` toggles preformatted mode; text inside such
		# a block is literal, so a "=>" there is not a link.
		if line.startswith(b'```'):
			preformatted = not preformatted
			continue
		if preformatted or not line.startswith(b'=>'): continue
		# Link line: "=>", optional blanks, URL, optional blanks + label.
		# Only spaces and tabs separate the fields; the \r is the tail of a
		# CRLF line ending.
		target = line[2:].strip(b' \t\r')
		newurl = target.replace(b'\t', b' ').split(b' ', 1)[0]
		if not newurl: continue  # "=>" with no URL after it
		# One undecodable byte should not abort the whole scan.
		print(absolutise_url(uri, newurl.decode('utf-8', errors='replace')))

def main():
	"""Walk every record of the WARC file named in argv[1] and print its links.

	Each record is a "WARC/..." version line, "Name: value" header lines, a
	blank line, Content-Length bytes of payload, then a blank-line pair.
	Every record is handed to checkout(); any structural problem ends the
	run through bail().
	"""
	if len(sys.argv) < 2: bail(f"usage: {sys.argv[0]} file.warc")

	with open(sys.argv[1], "rb") as fp:
		while line := fp.readline():
			if not line.startswith(b'WARC/'): bail("no WARC/")

			headers = dict()
			while (line := fp.readline()) != b'\r\n':
				# readline() returns b'' at EOF, which would otherwise be
				# parsed as a (broken) header line.
				if not line: bail("truncated headers")
				# Split on the first colon only: values such as URIs and
				# dates contain colons of their own.
				k, sep, v = line.decode('utf-8').rstrip('\r\n').partition(':')
				if not sep: bail("malformed header")
				headers[k.strip(' \t')] = v.strip(' \t')

			if "Content-Length" not in headers: bail("no Content-Length")
			try:
				length = int(headers["Content-Length"])
			except ValueError:
				bail("bad Content-Length")
			if length < 0: bail("bad Content-Length")
			pos = fp.tell()

			checkout(headers, fp)

			# checkout() may stop anywhere inside the payload (or never touch
			# it), so jump to the end of the record explicitly.
			fp.seek(pos + length)
			if fp.read(4) != b'\r\n\r\n': bail("misaligned")


if __name__ == "__main__":
	main()