summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-xfindlinks.py57
-rwxr-xr-x[-rw-r--r--]garcon.py2
2 files changed, 59 insertions, 0 deletions
diff --git a/findlinks.py b/findlinks.py
new file mode 100755
index 0000000..de592c9
--- /dev/null
+++ b/findlinks.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# Parses a WARC file, finds gemtext, resolves links.
+# ./findlinks.py file.warc
+
+import os
+import sys
+import urllib.parse
+
+def bail(msg):
+	"""Print msg to stderr and terminate the script with a failing exit status."""
+	print(msg, file=sys.stderr)
+	# sys.exit(1) instead of the bare site-provided exit(): that helper exists
+	# for the interactive shell and, called without arguments, reports
+	# success (status 0) even though this is the fatal-error path.
+	sys.exit(1)
+
+# directly stolen from gemini-demo
+def absolutise_url(base, relative):
+    """Resolve a link target against the URL of the document it appeared in.
+
+    base is the URL of the containing document; relative is the link target
+    as written. A target containing "://" is treated as already absolute and
+    returned unchanged; anything else is joined onto base.
+    """
+    if "://" in relative:
+        return relative
+    # urllib.parse.urljoin only resolves relative references for schemes it
+    # knows about, so a gemini base is disguised as http for the join.
+    gemini, http = "gemini://", "http://"
+    swapped = base.startswith(gemini)
+    if swapped:
+        base = http + base[len(gemini):]
+    joined = urllib.parse.urljoin(base, relative)
+    # Swap back only the leading scheme, and only if it was swapped above, so
+    # "http://" or "gemini://" text embedded in a path or query is preserved
+    # and a genuine http base is not turned into gemini.
+    if swapped and joined.startswith(http):
+        joined = gemini + joined[len(http):]
+    return joined
+
+def checkout(headers, fp):
+	"""Print the absolute URL of every link in one WARC record, if it is gemtext.
+
+	headers maps the record's WARC header names to their values; fp is a
+	binary file object positioned at the start of the record's payload.
+	Records that are not Gemini responses, did not succeed (status not 2x),
+	are not text/gemini, or lack WARC-Target-URI are skipped silently.
+	fp is left wherever reading stopped.
+	"""
+	if headers.get("Content-Type") != "application/gemini; msgtype=response": return
+	# First payload line is the Gemini response header: the status digits,
+	# then from offset 3 onwards the meta field, checked here as a MIME type.
+	header = fp.readline()
+	if not header or header[0] != ord('2'): return
+	mime = header[3:]
+	if not mime.startswith(b'text/gemini'): return
+
+	if "WARC-Target-URI" not in headers: return
+	uri = headers["WARC-Target-URI"]
+
+	# Content-Length counts the response header line too, so subtract it.
+	body = fp.read(int(headers["Content-Length"]) - len(header))
+	for line in body.split(b'\n'):
+		if not line.startswith(b'=>'): continue
+		# technically speaking this is invalid as it splits on more than spaces and tabs
+		fields = line[2:].split()
+		# A bare "=>" has no target; skip it rather than raise IndexError.
+		if not fields: continue
+		print(absolutise_url(uri, fields[0].decode('utf-8')))
+
+if len(sys.argv) < 2: bail("usage: findlinks.py file.warc")
+
+# Walk the WARC file record by record: a "WARC/..." version line, header
+# lines up to an empty line, Content-Length bytes of payload, then two CRLFs.
+with open(sys.argv[1], "rb") as fp:
+	while line := fp.readline():
+		if not line.startswith(b'WARC/'): bail("no WARC/")
+
+		headers = dict()
+		while (line := fp.readline()) != b'\r\n':
+			# readline() returns b'' at EOF; stop instead of mis-parsing it.
+			if not line: bail("truncated header")
+			# Split on the first colon only, so values that themselves
+			# contain ": " stay intact.
+			k, sep, v = line.decode('utf-8').rstrip('\r\n').partition(':')
+			if not sep: bail("malformed header")
+			headers[k] = v.strip(' \t')
+
+		if "Content-Length" not in headers: bail("no Content-Length")
+		pos = fp.tell()
+
+		checkout(headers, fp)
+
+		# checkout may have consumed any amount of the payload; jump to its
+		# end regardless and verify the record terminator.
+		fp.seek(pos + int(headers["Content-Length"]))
+		if fp.read(4) != b'\r\n\r\n': bail("misaligned")
diff --git a/garcon.py b/garcon.py
index e10f85e..6ada364 100644..100755
--- a/garcon.py
+++ b/garcon.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 from sys import stdin, stdout, stderr
 import asyncio
 import datetime