-rw-r--r--  README (renamed from README.txt)              |  0
-rw-r--r--  URLget.py                                     | 40
-rw-r--r--  requirements.txt                              |  2
-rwxr-xr-x  youtube.alt.py (renamed from youtube.py.old)  | 68
-rwxr-xr-x  youtube.py                                    |  2
5 files changed, 90 insertions, 22 deletions
diff --git a/README.txt b/README
index ac4778c..ac4778c 100644
--- a/README.txt
+++ b/README
diff --git a/URLget.py b/URLget.py
new file mode 100644
index 0000000..abe27a1
--- /dev/null
+++ b/URLget.py
@@ -0,0 +1,40 @@
+class URLgetException(Exception):
+    pass
+
+
+try:
+    from curl_cffi import requests
+
+    # from curl_cffi.requests.exceptions import HTTPError
+    print("using curl_cffi")
+
+    def urlget(url):
+        # probably want to impersonate "chrome", "safari" or "safari_ios"
+        # could impersonate some more specific versions too I guess
+        try:
+            r = requests.get(url, impersonate="safari_ios")
+            # print(dir(r))
+            # print(r.status_code)
+        except Exception as e:
+            raise URLgetException(e)
+        return r.status_code, r.text
+
+except ModuleNotFoundError:
+    # fall back to plain User-Agent spoofing; it probably won't help, but at least it shouldn't hurt
+    from urllib.request import Request, urlopen
+
+    # from urllib.error import HTTPError
+    print("using urllib.request")
+
+    def urlget(url):
+        # update as needed I guess
+        ua = "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0"
+        # req=Request(url)
+        req = Request(url)
+        req.add_header("User-Agent", ua)
+        try:
+            r = urlopen(req)
+        except Exception as e:
+            # except HTTPError as e:
+            raise URLgetException(e)
+        return r.status, r.read().decode()
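
Whichever import succeeds, URLget.py ends up exposing the same urlget(url) helper: on success it returns a (status, body) tuple, and any transport error from either backend is re-raised as URLgetException. A minimal usage sketch, outside the diff, assuming nothing beyond the module above (the example URL is purely illustrative):

from URLget import urlget, URLgetException

try:
    status, body = urlget("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
except URLgetException as e:
    print(f"fetch failed: {e}")
else:
    if status == 200:
        print(f"got {len(body)} characters of HTML")
    else:
        print(f"unexpected HTTP status: {status}")

Note the two backends behave slightly differently: curl_cffi hands back non-200 responses as a status code, while urllib raises HTTPError (wrapped into URLgetException), which is why youtube.alt.py below checks both paths.
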
diff --git a/requirements.txt b/requirements.txt
index 8a70ebc..cb7e46d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 ircstates 
+#OPTIONAL: only for TLS fingerprint spoofing, used by youtube.alt.py
+curl_cffi
diff --git a/youtube.py.old b/youtube.alt.py
index b09b1d1..ca478d8 100755
--- a/youtube.py.old
+++ b/youtube.alt.py
@@ -1,9 +1,20 @@
 #!/usr/bin/env python3
 from html.parser import HTMLParser
-from urllib.request import urlopen
-from urllib.error import HTTPError
+from URLget import urlget, URLgetException
+
+# from URLget import URLgetException
+# urlget=URLget().urlget
+# print(urlget("http://ip.envs.net"))
+# print(dir(URLget))
+
 
 class YouTube:
+    # crude import, lol
+    # URLget = URLget().URLget
+    # def __init__(self):
+    # self.URLget = URLget().URLget
+    # print(URLget,URLget.URLget)
+
     video_type = ""
 
     def mesg(self, msg, t=None):
@@ -34,35 +45,44 @@ class YouTube:
 
         return r
 
-    def is_embed(str):
+    def is_embed(self, str):
         return str.startswith("https://www.youtube.com/embed/") or str.startswith(
             "https://www.youtube-nocookie.com/embed/"
         )
 
-    def is_ytmusic(str):
+    def is_ytmusic(self, str):
         return str.startswith("https://music.youtube.com/watch?v=")
 
-    def is_ytshorts(str):
+    def is_ytshorts(self, str):
         return str.startswith("https://youtube.com/shorts/") or str.startswith(
             "https://www.youtube.com/shorts/"
         )
 
-    def is_clip(str):
+    def is_clip(self, str):
         return str.startswith("https://youtube.com/clip/") or str.startswith(
             "https://www.youtube.com/clip/"
         )
 
     class parseprop(HTMLParser):
         def __init__(self):
-            #print("yt parse init")
+            # print("yt parse init")
             HTMLParser.__init__(self)
             self.itemprops_list = ["name", "duration", "uploadDate", "interactionCount"]
             self.h = {}
             if YouTube.video_type == "clip":
                 self.itemprops_list += ["description"]
                 print("it is a clip!")
+            self.title = False
+
+        def handle_data(self, data):
+            if self.title != False:
+                # print("title",data)
+                self.h.update({"html_title": data})
+                self.title = False
 
         def handle_starttag(self, tag, attrs):
+            if tag == "title":
+                self.title = True
             if (tag != "meta" and tag != "link") or (
                 (
                     [i for i in attrs if "itemprop" in i] == []
@@ -141,28 +161,33 @@ class YouTube:
         p = self.parseprop()
         # use premature optimization? it should be SLIGHTLY faster, but can sometimes fail
         data = b""
-        if self.premature_optimization:
-            url_h = urlopen(url)
-            # <body> appears on approximately line 21 or 22, so we read 24 lines to be safe (23-25 should be license comment)
-            # I tried to read byte amounts but it's hard to make sure no invalid utf8 bytes happen due to partial reads
-            for i in range(24):
-                data += url_h.readline()
-            url_h.close()
         data = data.decode()  # bytes to utf-8
         if (
             data.find('meta itemprop="duration"') == -1
             or data.find('meta itemprop="name"') == -1
-        ):  # acts as both fallback for optimization, and in case optimization's turned off
-            # just read all of the html
-            try: data = urlopen(url).read().decode()
-            except HTTPError as e:
-              irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {e} \x0315\x03"
-              ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m"
+        ):
+            try:
+                status, data = urlget(url)
+                if status != 200:
+                    irc_string = (
+                        f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {status} \x0315\x03"
+                    )
+                    ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {status} \x1b[37;2m\x1b[0m"
+            except URLgetException as e:
+                irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {e} \x0315\x03"
+                ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m"
         # print(f"\x1b[31m my data is: {data}\x1b[0m")
         p.feed(data)
         if p.h == {}:
             print(ansi_string)
             return irc_string, True
+        elif p.h == {"html_title": "YouTube"}:
+            irc_string = (
+                "[\x0304Youtube\x03] \x0307ERROR:\x0308 flagged as bot \x0315\x03"
+            )
+            ansi_string = "[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m flagged as bot \x1b[37;2m\x1b[0m"
+            print(ansi_string)
+            return irc_string, True
         y = p.h
         print(y)
         y.update(duration=self.fmt_dur(y["duration"]))
@@ -176,4 +201,5 @@ if __name__ == "__main__":
     import sys
 
     YouTube.premature_optimization = False
-    YouTube.yt(YouTube, sys.argv[1])
+    # YouTube.yt(YouTube, sys.argv[1])
+    YouTube().yt(sys.argv[1])
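
The substantive change in this file is the <title> capture added to parseprop: when YouTube serves its bot-check page there is no itemprop metadata to scrape and the page title is just "YouTube", so yt() can now report "flagged as bot" instead of a generic parse failure. A stripped-down sketch of that pattern (class and variable names here are mine, not the bot's):

from html.parser import HTMLParser

class TitleGrabber(HTMLParser):
    # set a flag on the <title> start tag, record the next data chunk as the title
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_title = False
        self.title = None

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self.in_title = True

    def handle_data(self, data):
        if self.in_title:
            self.title = data
            self.in_title = False

p = TitleGrabber()
p.feed("<html><head><title>YouTube</title></head><body>bot check</body></html>")
print(p.title)  # "YouTube" with no video metadata -> treated as a bot flag
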
diff --git a/youtube.py b/youtube.py
index 1e60546..0d51e16 100755
--- a/youtube.py
+++ b/youtube.py
@@ -96,7 +96,7 @@ class YouTube:
             video_id = None
         try:
             playlist_id = qs["list"][0]
-            #ignore the "random mix" and "radio" lists
+            # ignore the "random mix" and "radio" lists
             if playlist_id.startswith("RD"):
                 playlist_id = None
         except KeyError:
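
For reference, the surrounding code in youtube.py pulls the list parameter out of the parsed query string and discards IDs starting with "RD", which are YouTube's auto-generated mix/radio playlists. A self-contained sketch of that logic, assuming qs comes from urllib.parse.parse_qs (the helper name is mine):

from urllib.parse import urlparse, parse_qs

def playlist_id_from(url):
    # grab the playlist ID, ignoring the "random mix" and "radio" lists
    qs = parse_qs(urlparse(url).query)
    try:
        playlist_id = qs["list"][0]
        if playlist_id.startswith("RD"):
            playlist_id = None
    except KeyError:
        playlist_id = None
    return playlist_id

print(playlist_id_from("https://www.youtube.com/watch?v=abc&list=RDabc"))      # None
print(playlist_id_from("https://www.youtube.com/watch?v=abc&list=PLexample"))  # PLexample
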