diff options
-rw-r--r-- | README (renamed from README.txt) | 0 | ||||
-rw-r--r-- | URLget.py | 40 | ||||
-rw-r--r-- | requirements.txt | 2 | ||||
-rwxr-xr-x | youtube.alt.py (renamed from youtube.py.old) | 68 | ||||
-rwxr-xr-x | youtube.py | 2 |
5 files changed, 90 insertions, 22 deletions
diff --git a/README.txt b/README index ac4778c..ac4778c 100644 --- a/README.txt +++ b/README diff --git a/URLget.py b/URLget.py new file mode 100644 index 0000000..abe27a1 --- /dev/null +++ b/URLget.py @@ -0,0 +1,40 @@ +class URLgetException(Exception): + pass + + +try: + from curl_cffi import requests + + # from curl_cffi.requests.exceptions import HTTPError + print("using curl_cffi") + + def urlget(url): + # probably want to impersonate "chrome", "safari" or "safari_ios" + # could impersonate some more specific versions too I guess + try: + r = requests.get(url, impersonate="safari_ios") + # print(dir(r)) + # print(r.status_code) + except Exception as e: + raise URLgetException(e) + return r.status_code, r.text + +except ModuleNotFoundError: + # fallback to just dumb user-agent spoofing, it will not help, but at least it won't hurt? + from urllib.request import Request, urlopen + + # from urllib.error import HTTPError + print("using urllib.request") + + def urlget(url): + # update as needed I guess + ua = "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0" + # req=Request(url) + req = Request(url) + req.add_header("User-Agent", ua) + try: + r = urlopen(req) + except Exception as e: + # except HTTPError as e: + raise URLgetException(e) + return r.status, r.read().decode() diff --git a/requirements.txt b/requirements.txt index 8a70ebc..cb7e46d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ ircstates +#OPTIONAL: only for TLS fingerprint spoofing, used by youtube.alt.py +curl_cffi diff --git a/youtube.py.old b/youtube.alt.py index b09b1d1..ca478d8 100755 --- a/youtube.py.old +++ b/youtube.alt.py @@ -1,9 +1,20 @@ #!/usr/bin/env python3 from html.parser import HTMLParser -from urllib.request import urlopen -from urllib.error import HTTPError +from URLget import urlget, URLgetException + +# from URLget import URLgetException +# urlget=URLget().urlget +# print(urlget("http://ip.envs.net")) +# print(dir(URLget)) + class YouTube: + # crude import, lol + # URLget = URLget().URLget + # def __init__(self): + # self.URLget = URLget().URLget + # print(URLget,URLget.URLget) + video_type = "" def mesg(self, msg, t=None): @@ -34,35 +45,44 @@ class YouTube: return r - def is_embed(str): + def is_embed(self, str): return str.startswith("https://www.youtube.com/embed/") or str.startswith( "https://www.youtube-nocookie.com/embed/" ) - def is_ytmusic(str): + def is_ytmusic(self, str): return str.startswith("https://music.youtube.com/watch?v=") - def is_ytshorts(str): + def is_ytshorts(self, str): return str.startswith("https://youtube.com/shorts/") or str.startswith( "https://www.youtube.com/shorts/" ) - def is_clip(str): + def is_clip(self, str): return str.startswith("https://youtube.com/clip/") or str.startswith( "https://www.youtube.com/clip/" ) class parseprop(HTMLParser): def __init__(self): - #print("yt parse init") + # print("yt parse init") HTMLParser.__init__(self) self.itemprops_list = ["name", "duration", "uploadDate", "interactionCount"] self.h = {} if YouTube.video_type == "clip": self.itemprops_list += ["description"] print("it is a clip!") + self.title = False + + def handle_data(self, data): + if self.title != False: + # print("title",data) + self.h.update({"html_title": data}) + self.title = False def handle_starttag(self, tag, attrs): + if tag == "title": + self.title = True if (tag != "meta" and tag != "link") or ( ( [i for i in attrs if "itemprop" in i] == [] @@ -141,28 +161,33 @@ class YouTube: p = self.parseprop() # use premature optimization? it should be SLIGHTLY faster, but can sometimes fail data = b"" - if self.premature_optimization: - url_h = urlopen(url) - # <body> appears on approximately line 21 or 22, so we read 24 lines to be safe (23-25 should be license comment) - # I tried to read byte amounts but it's hard to make sure no invalid utf8 bytes happen due to partial reads - for i in range(24): - data += url_h.readline() - url_h.close() data = data.decode() # bytes to utf-8 if ( data.find('meta itemprop="duration"') == -1 or data.find('meta itemprop="name"') == -1 - ): # acts as both fallback for optimization, and in case optimization's turned off - # just read all of the html - try: data = urlopen(url).read().decode() - except HTTPError as e: - irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {e} \x0315\x03" - ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m" + ): + try: + status, data = urlget(url) + if status != 200: + irc_string = ( + f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {status} \x0315\x03" + ) + ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {status} \x1b[37;2m\x1b[0m" + except URLgetException as e: + irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {e} \x0315\x03" + ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m" # print(f"\x1b[31m my data is: {data}\x1b[0m") p.feed(data) if p.h == {}: print(ansi_string) return irc_string, True + elif p.h == {"html_title": "YouTube"}: + irc_string = ( + "[\x0304Youtube\x03] \x0307ERROR:\x0308 flagged as bot \x0315\x03" + ) + ansi_string = "[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m flagged as bot \x1b[37;2m\x1b[0m" + print(ansi_string) + return irc_string, True y = p.h print(y) y.update(duration=self.fmt_dur(y["duration"])) @@ -176,4 +201,5 @@ if __name__ == "__main__": import sys YouTube.premature_optimization = False - YouTube.yt(YouTube, sys.argv[1]) + # YouTube.yt(YouTube, sys.argv[1]) + YouTube().yt(sys.argv[1]) diff --git a/youtube.py b/youtube.py index 1e60546..0d51e16 100755 --- a/youtube.py +++ b/youtube.py @@ -96,7 +96,7 @@ class YouTube: video_id = None try: playlist_id = qs["list"][0] - #ignore the "random mix" and "radio" lists + # ignore the "random mix" and "radio" lists if playlist_id.startswith("RD"): playlist_id = None except KeyError: |