blob: de592c9b6ad9c9c555a26bfa5cde5d246da0d88c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
#!/usr/bin/env python3
# Parses a WARC file, finds gemtext, resolves links.
# ./findlinks.py file.warc
import os
import sys
import urllib.parse
def bail(msg):
    """Print *msg* to stderr and terminate the script with a failing status.

    ``sys.exit`` is used instead of the bare ``exit`` helper: that helper is
    injected by the ``site`` module for interactive sessions and is not
    guaranteed to exist, and calling it without an argument reported
    success (status 0) to the calling shell even though an error occurred.

    Args:
        msg: Human-readable description of the problem.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print(msg, file=sys.stderr)
    sys.exit(1)
# directly stolen from gemini-demo
def absolutise_url(base, relative):
    """Resolve a link found in a gemtext page against the page's own URL.

    Args:
        base: URL of the page the link was found on.
        relative: Link target as written in the page.

    Returns:
        ``relative`` unchanged if it contains ``"://"``; otherwise the
        result of joining it onto ``base``.
    """
    # Anything that already carries "scheme://" is treated as absolute.
    if "://" in relative:
        return relative

    gemini_prefix = "gemini://"
    http_prefix = "http://"

    # Python's URL tools only resolve relative references for schemes they
    # know about, so a gemini base is temporarily disguised as http. Only
    # the leading scheme is swapped: an unanchored replace would also
    # rewrite any "http://" embedded later in the URL (e.g. in a query).
    disguised = base.startswith(gemini_prefix)
    if disguised:
        base = http_prefix + base[len(gemini_prefix):]

    joined = urllib.parse.urljoin(base, relative)

    # Undo the disguise, and only if one was applied in the first place.
    if disguised and joined.startswith(http_prefix):
        joined = gemini_prefix + joined[len(http_prefix):]
    return joined
def checkout(headers, fp):
    """Print the absolute target of every gemtext link in one WARC record.

    The record is processed only if its ``Content-Type`` field is exactly
    ``application/gemini; msgtype=response``, the first payload line starts
    with ``2`` and names a ``text/gemini`` type, and a ``WARC-Target-URI``
    field is present. Otherwise the function returns without printing.

    Args:
        headers: Mapping of the record's named fields (strings).
        fp: Binary file object positioned at the start of the record
            payload. It is advanced by whatever this function reads; the
            caller is expected to reposition it afterwards.

    Raises:
        KeyError: if the record qualifies but has no ``Content-Length``.
        ValueError: if ``Content-Length`` is not an integer.
        UnicodeDecodeError: if a link target is not valid UTF-8.
    """
    if headers.get("Content-Type") != "application/gemini; msgtype=response":
        return

    header = fp.readline()
    # Only responses whose status starts with '2' are scanned.
    if not header or header[0] != ord("2"):
        return

    # Skip the first three bytes (status digits and separator) to reach the type.
    mime = header[3:]
    if not mime.startswith(b"text/gemini"):
        return

    if "WARC-Target-URI" not in headers:
        return
    uri = headers["WARC-Target-URI"]

    # Clamp at zero: a negative size would make read() consume everything
    # up to end of file, dragging later records into this page's body.
    remaining = max(0, int(headers["Content-Length"]) - len(header))
    body = fp.read(remaining)

    for line in body.split(b"\n"):
        if not line.startswith(b"=>"):
            continue
        # technically speaking this is invalid as it splits on more than
        # spaces and tabs
        fields = line[2:].split()
        if not fields:
            # "=>" with nothing after it: there is no URL to report.
            continue
        print(absolutise_url(uri, fields[0].decode("utf-8")))
def _read_record_headers(fp):
    """Read one record's named fields, up to the blank line, into a dict."""
    headers = {}
    while (line := fp.readline()) != b"\r\n":
        if not line:
            # End of file before the blank line that closes the header block.
            bail("truncated header")
        # partition() splits on the first ": " only, so values that
        # themselves contain ": " are kept intact.
        key, sep, value = line.decode("utf-8").rstrip("\r\n").partition(": ")
        if not sep:
            bail("malformed header")
        headers[key] = value
    return headers


def main():
    """Walk every record of the WARC file named on the command line.

    For each record the named fields are parsed, the payload is handed to
    ``checkout`` (which prints any links it finds), and the file position
    is then moved to the end of the payload as given by ``Content-Length``,
    where the two-CRLF record terminator must follow.
    """
    if len(sys.argv) < 2:
        bail("usage: findlinks.py file.warc")

    with open(sys.argv[1], "rb") as fp:
        while line := fp.readline():
            if not line.startswith(b"WARC/"):
                bail("no WARC/")
            headers = _read_record_headers(fp)
            if "Content-Length" not in headers:
                bail("no Content-Length")
            pos = fp.tell()
            checkout(headers, fp)
            # checkout() may have read any amount of the payload; jump to
            # its end regardless of how far it got.
            fp.seek(pos + int(headers["Content-Length"]))
            if fp.read(4) != b"\r\n\r\n":
                bail("misaligned")


if __name__ == "__main__":
    main()
|