diff options
-rw-r--r-- | README | 8 | ||||
-rw-r--r-- | README.txt | 3 | ||||
-rw-r--r-- | URLget.py | 45 | ||||
-rw-r--r-- | _.txt | 19 | ||||
-rwxr-xr-x | applemusic.py | 8 | ||||
-rwxr-xr-x | bandcamp.py | 11 | ||||
-rw-r--r-- | commands.py | 21 | ||||
-rw-r--r-- | config.py | 4 | ||||
-rw-r--r-- | requirements.txt | 4 | ||||
-rwxr-xr-x | soundcloud.py | 4 | ||||
-rwxr-xr-x | spotify.py | 17 | ||||
-rw-r--r-- | stuff.py | 39 | ||||
-rw-r--r-- | util.py | 8 | ||||
-rwxr-xr-x | youtube_abstract.py | 215 | ||||
-rwxr-xr-x | youtube_oembed.py | 146 | ||||
-rwxr-xr-x | youtube_oembed_old.py (renamed from youtube.py) | 19 | ||||
-rwxr-xr-x | youtube_scrape.py (renamed from youtube.py.old) | 93 |
17 files changed, 528 insertions, 136 deletions
diff --git a/README b/README new file mode 100644 index 0000000..fa6f089 --- /dev/null +++ b/README @@ -0,0 +1,8 @@ +code is formatted with "black --line-length=120" +#remember to install the libraries, and you probably should use venvs anyway +pip --require-venv install -r requirements.txt + +for youtube, there's several options, but for just getting a title, you'll want to use youtube_oembed.py +scraping youtube may or may not work based on your IP and stuff (it should mostly work on home IPs, but tends to get blocked from servers), using curl_ffi to spoof TLS fingerprint may help slightly, I don't handle the POT token stuff yet + +URLget.py is an abstraction that either uses python's builtin urllib.request or, if you have it available, then curl_cffi (which can spoof TLS fingerprinting, though that likely has minimal effect) diff --git a/README.txt b/README.txt deleted file mode 100644 index ac4778c..0000000 --- a/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -code is formatted with "black" -#remember to install the libraries, and you probably should use venvs anyway -pip --require-venv install -r requirements.txt diff --git a/URLget.py b/URLget.py new file mode 100644 index 0000000..0f3f7c7 --- /dev/null +++ b/URLget.py @@ -0,0 +1,45 @@ +import sys + + +class URLgetException(Exception): + pass + + +try: + from curl_cffi import requests + + # from curl_cffi.requests.exceptions import HTTPError + if sys.stderr.isatty(): + print("using curl_cffi", file=sys.stderr) + + def urlget(url): + # probably want to impersonate "chrome", "safari" or "safari_ios" + # could impersonate some more specific versions too I guess + try: + r = requests.get(url, impersonate="safari_ios") + # print(dir(r)) + # print(r.status_code) + except Exception as e: + raise URLgetException(e) + return r.status_code, r.text + +except ModuleNotFoundError: + # fallback to just dumb user-agent spoofing, it will not help, but at least it won't hurt? + from urllib.request import Request, urlopen + + # from urllib.error import HTTPError + if sys.stderr.isatty(): + print("using urllib.request", file=sys.stderr) + + def urlget(url): + # update as needed I guess + ua = "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0" + # req=Request(url) + req = Request(url) + req.add_header("User-Agent", ua) + try: + r = urlopen(req) + except Exception as e: + # except HTTPError as e: + raise URLgetException(e) + return r.status, r.read().decode() diff --git a/_.txt b/_.txt new file mode 100644 index 0000000..6737b50 --- /dev/null +++ b/_.txt @@ -0,0 +1,19 @@ +import urllib.request, json + +req = urllib.request.Request("https://www.youtube.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8") +req.add_header("Content-Type", "application/json") +j = json.loads( + urllib.request.urlopen( + req, + b'{"context":{"client":{"clientName":"TVHTML5_SIMPLY_EMBEDDED_PLAYER","clientVersion":"2.0"},"thirdParty":{"embedUrl":"https://www.youtube.com"}},"videoId": "VpGjqueO0b4"}', + ) + .read() + .decode() +) +j = j["videoDetails"] +print( + f'{j["title"]} ({j["lengthSeconds"]} sec), uploaded by {j["author"]}, {j["viewCount"]} views', + j["isCrawlable"], + j["isUnpluggedCorpus"], + j["isLiveContent"], +) diff --git a/applemusic.py b/applemusic.py index 9e10ab9..8334e5b 100755 --- a/applemusic.py +++ b/applemusic.py @@ -78,7 +78,9 @@ class AppleMusic: url = url.rstrip("\x01") title = "" artist = "" - irc_string = "[\x0304AppleMusic\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + irc_string = ( + "[\x0304AppleMusic\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + ) ansi_string = "[\x1b[31mAppleMusic\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" if self.is_album(url): title, artist = self.applemusic_oembed(url) @@ -93,9 +95,7 @@ class AppleMusic: if title == "": print(ansi_string) return irc_string, True - irc_string = ( - f"[\x0303AppleMusic\x03] \x02{title}\x02 uploaded by \x1d{artist}\x1d" - ) + irc_string = f"[\x0303AppleMusic\x03] \x02{title}\x02 uploaded by \x1d{artist}\x1d" ansi_string = f"[\x1b[32mAppleMusic\x1b[0m] \x1b[1m{title}\x1b[0m uploaded by \x1b[03m{artist}\x1b[0m" # """ # irc_string="dummy";ansi_string="dummy" diff --git a/bandcamp.py b/bandcamp.py index 2d76b22..727181d 100755 --- a/bandcamp.py +++ b/bandcamp.py @@ -13,12 +13,7 @@ class Bandcamp: self.util.mesg(msg, t) def match_urls(self, str): - r = [ - i - for i in str.split() - if "https://" in i - and ("bandcamp.com/album/" in i or "bandcamp.com/track/" in i) - ] + r = [i for i in str.split() if "https://" in i and ("bandcamp.com/album/" in i or "bandcamp.com/track/" in i)] r = list(dict.fromkeys(r)) n = 0 for i in r: @@ -53,7 +48,9 @@ class Bandcamp: p = self.parseprop() data = urlopen(url).read().decode() p.feed(data) - irc_string = "[\x0304BandCamp\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + irc_string = ( + "[\x0304BandCamp\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + ) ansi_string = "[\x1b[31mBandCamp\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" data = json.loads(Bandcamp.ldjson) try: diff --git a/commands.py b/commands.py index 0740916..0493054 100644 --- a/commands.py +++ b/commands.py @@ -150,11 +150,7 @@ class Command: if not ctcp.endswith("\x01"): ctcp = ctcp + "\x01" if ctcp_upper.startswith("PING"): - ctcp = ( - "\x01PING" - + ("" if 1 == len(ctcp.split(" ")) else " ") - + " ".join(ctcp.split(" ")[1:]) - ) + ctcp = "\x01PING" + ("" if 1 == len(ctcp.split(" ")) else " ") + " ".join(ctcp.split(" ")[1:]) print(ctcp) self.notice(ctcp) if ctcp_upper.startswith("SOURCE"): @@ -190,11 +186,7 @@ class Command: args = args + ["", 3] elif len(args) < 3: args = args + ["3"] - for i in ( - popen(f"git log --pretty=oneline --abbrev-commit {args[1]}") - .read() - .split("\n", int(args[2])) - ): + for i in popen(f"git log --pretty=oneline --abbrev-commit {args[1]}").read().split("\n", int(args[2])): mesg(i) else: mesg(popen("git pull").read()) @@ -253,17 +245,12 @@ class Command: amount = 1 if not str(faces).isnumeric() or faces > 100: faces = 6 - mesg( - f"rolling {amount}d{faces}: " - + str([random.choice([i for i in range(faces)]) for n in range(amount)]) - ) + mesg(f"rolling {amount}d{faces}: " + str([random.choice([i for i in range(faces)]) for n in range(amount)])) @cmd def version(self, prefix, cmd, pm, line, admin, mesg): """version""" - mesg( - f"{self.config.self.nick} version {self.getversion()} ({self.config.self.source})" - ) + mesg(f"{self.config.self.nick} version {self.getversion()} ({self.config.self.source})") @adm def dbg2(self, prefix, cmd, pm, line, admin, mesg): diff --git a/config.py b/config.py index 51d4364..ee4cc81 100644 --- a/config.py +++ b/config.py @@ -24,9 +24,7 @@ class config(config): port = 6667 ssl = False nickserv_auth = False - nickserv_mask = ( - "NickServ!NickServ@localhost" # the mask you receive from server - ) + nickserv_mask = "NickServ!NickServ@localhost" # the mask you receive from server nickserv_squery = False # squery seems to only be a thing on ngircd nickserv_path = "NickServ@localhost" # the mask you actually send commands to # get password from secret file diff --git a/requirements.txt b/requirements.txt index 8a70ebc..76a56fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ -ircstates +ircstates +# OPTIONAL: only for TLS fingerprint spoofing, used by youtube.alt.py +curl_cffi diff --git a/soundcloud.py b/soundcloud.py index 7fa6a53..a56dfc8 100755 --- a/soundcloud.py +++ b/soundcloud.py @@ -43,9 +43,7 @@ class SoundCloud: ansi_string = "[\x1b[31mSoundCloud\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" print(ansi_string) return irc_string, True - irc_string = ( - f"[\x0303SoundCloud\x03] \x02{title}\x02 uploaded by \x1d{artist}\x1d" - ) + irc_string = f"[\x0303SoundCloud\x03] \x02{title}\x02 uploaded by \x1d{artist}\x1d" ansi_string = f"[\x1b[32mSoundCloud\x1b[0m] \x1b[1m{title}\x1b[0m uploaded by \x1b[03m{artist}\x1b[0m" # """ # irc_string="dummy";ansi_string="dummy" diff --git a/spotify.py b/spotify.py index e9b0790..609b2e2 100755 --- a/spotify.py +++ b/spotify.py @@ -48,24 +48,22 @@ class Spotify: p = self.parseprop() data = urlopen(url).read().decode() p.feed(data) - irc_string = "[\x0304Spotify\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + irc_string = ( + "[\x0304Spotify\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + ) ansi_string = "[\x1b[31mSpotify\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" data = json.loads(Spotify.ldjson) try: type = data["@type"] except KeyError: - print(ansi_string) + if __import__("sys").stdout.isatty(): + print(ansi_string) return irc_string, True id = data["@id"] name = data["name"] date = data["datePublished"] artists = data["description"] - artists = ( - artists.removeprefix(f"Listen to {name} on Spotify") - .removeprefix(".") - .strip() - .removeprefix("· ") - ) + artists = artists.removeprefix(f"Listen to {name} on Spotify").removeprefix(".").strip().removeprefix("· ") if artists.startswith("Song · "): artists = artists.removeprefix("Song · ") elif artists.startswith("Album · "): @@ -77,7 +75,8 @@ class Spotify: ansi_string = f"[\x1b[32mSpotify\x1b[0m] \x1b[1m{name}\x1b[0m by \x1b[03m{artists}\x1b[0m published on {date}" # print(("Song: " if type=="MusicRecording" else "Album: " if type=="MusicAlbum" else f"Unknown type ({type}): ")+'"'+name+'"'+" by "+'"'+artists+'"'+" released on "+date) # irc_string="dummy" - print(ansi_string) + if __import__("sys").stdout.isatty(): + print(ansi_string) return irc_string, False diff --git a/stuff.py b/stuff.py index 299555b..1a873c1 100644 --- a/stuff.py +++ b/stuff.py @@ -39,11 +39,7 @@ def stuff(bot, sock): if nickserv_auth: nick_override = True if config.server.nickserv_squery: - util.send( - irctokens.build( - "SQUERY", ["NickServ", f"IDENTIFY {nick} {passwd}"] - ).format() - ) + util.send(irctokens.build("SQUERY", ["NickServ", f"IDENTIFY {nick} {passwd}"]).format()) util.send( irctokens.build( "SQUERY", @@ -94,11 +90,7 @@ def stuff(bot, sock): send = util.send send(irctokens.build("NICK", [config.self.nick]).format()) - send( - irctokens.build( - "USER", [config.self.username, "0", "*", config.self.realname] - ).format() - ) + send(irctokens.build("USER", [config.self.username, "0", "*", config.self.realname]).format()) while True: self_nick = server.nickname recv_data = sock.recv(1024) @@ -126,9 +118,7 @@ def stuff(bot, sock): auth() if line.command == "433": # 433 is ERR_NICKNAMEINUSE util.nick(config.self.nick + "_") - if ( - line.command == "376" or line.command == "422" - ): # 376 is RPL_ENDOFMOTD and 422 is ERR_NOMOTD + if line.command == "376" or line.command == "422": # 376 is RPL_ENDOFMOTD and 422 is ERR_NOMOTD if config.server.nickserv_auth == True: auth() send(irctokens.build("CAP", ["LS", "302"]).format()) @@ -215,10 +205,7 @@ def stuff(bot, sock): command.prefix = prefix cmd = cmd.strip() try: - is_adm = ( - line.tags["account"] in admin_accounts - or line.source in admin_users - ) + is_adm = line.tags["account"] in admin_accounts or line.source in admin_users except ( KeyError, TypeError, @@ -259,28 +246,18 @@ def stuff(bot, sock): command.util = util command.util.target = target mesg("reloaded") - elif ( - cmd.startswith("eval ") - and "eval" not in config.cmd.disabled - ): + elif cmd.startswith("eval ") and "eval" not in config.cmd.disabled: if is_adm: try: - result = eval( - cmd[len("eval ") :].strip() or "None" - ) + result = eval(cmd[len("eval ") :].strip() or "None") except Exception as e: mesg("Error: " + str(e)) else: mesg("Error: you're not authorized to eval") - elif ( - cmd.startswith("exec ") - and "exec" not in config.cmd.disabled - ): + elif cmd.startswith("exec ") and "exec" not in config.cmd.disabled: if is_adm: try: - result = exec( - cmd[len("exec ") :].strip() or "None" - ) + result = exec(cmd[len("exec ") :].strip() or "None") except Exception as e: mesg("Error: " + str(e)) else: diff --git a/util.py b/util.py index 8538ef3..6525160 100644 --- a/util.py +++ b/util.py @@ -78,9 +78,7 @@ class Util: def action(self, msg: str, t=None): t, msg = self._m(msg, t) - self.send( - irctokens.build("PRIVMSG", [t, "\x01ACTION " + str(msg) + "\x01"]).format() - ) + self.send(irctokens.build("PRIVMSG", [t, "\x01ACTION " + str(msg) + "\x01"]).format()) def notice(self, msg: str, t=None): t, msg = self._m(msg, t) @@ -89,7 +87,5 @@ class Util: def maskmatch(self, string: str, hostmask: str): """DOES NOT HANDLE CASEMAPPING (yet), just dumb case-sensitive match, only ? and * are special""" print("string is", string, "and hostmask is", hostmask) - pat = "[[]".join( - [x.replace("]", "[]]") for x in hostmask.split("[")] - ) # escape all [ and ] into [[] and []] + pat = "[[]".join([x.replace("]", "[]]") for x in hostmask.split("[")]) # escape all [ and ] into [[] and []] return fnmatchcase(string, pat) diff --git a/youtube_abstract.py b/youtube_abstract.py new file mode 100755 index 0000000..f773aa0 --- /dev/null +++ b/youtube_abstract.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +from urllib.parse import urlencode, urlparse, parse_qs +from json import loads as json_loads +from URLget import urlget, URLgetException + +import sys + + +def dbgprint(*args, **kwargs): + if sys.stdout.isatty(): + print(*args, **kwargs) + + +class YouTube: + def __init__(self): + # whether urls with a playlist included, should title the video, or the playlist? + try: + YouTube.prefer_playlist = YouTube.prefer_playlist + except AttributeError: # we probably want video title, default to that + YouTube.prefer_playlist = False + self.irc_pal = { # ignore the fancy alignment BS lol + "rst": "" + "\x0f", ####### reset + "ylw": "" + "\x0307", ##### yellow + "b_ylw": "" "\x0307\x02", # bold yellow + "wht": "" + "\x0315", ##### white + "red": "" + "\x0304", ##### red + "grn": "" + "\x0303", ##### green + "itl": "" + "\x1d", ####### italic + "bld": "" + "\x02", ####### bold + } + self.ansi_pal = { # ignore the fancy alignment BS lol + "rst": "" + "\x1b[0m", #### reset + "ylw": "" + "\x1b[33;2m", # yellow + "b_ylw": "" "\x1b[33;1m", # bold yellow + "wht": "" + "\x1b[37;2m", # white + "red": "" + "\x1b[31m", ### red + "grn": "" + "\x1b[32m", ### green + "itl": "" + "\x1b[03m", ### italic + "bld": "" + "\x1b[1m", #### bold + } + + def mesg(self, msg, t=None): # just an alias to shorten full name + self.util.mesg(msg, t) + + def match_urls(self, str, r=[]): + if str.startswith("http://"): + str = "https://" + str[7:] + if str.startswith("https://youtube."): + str = "https://www." + str[8:] + if str.startswith("https://"): # first string has to be trimmed before calling match_urls + if ( # I'm just doing fancy BS to align the urls nicely, lol, ignore this + str.startswith("https://youtu.be/") + or str.startswith("" "" "" "" "https://www.youtube.com/playlist?") ####### playlist + or str.startswith("" "" "" "https://music.youtube.com/playlist?") + or str.startswith("" "" "" "" "https://m.youtube.com/playlist?") + or str.startswith("" "" "" "" "https://www.youtube.com/shorts/") ######### shorts + or str.startswith("" "" "" "" "" "https://youtube.com/shorts/") + or str.startswith("" "" "" "" "https://m.youtube.com/shorts/") + or str.startswith("" "" "" "" "https://www.youtube.com/watch?") ########## normal + or str.startswith("" "" "" "https://music.youtube.com/watch?") + or str.startswith("" "" "" "" "https://m.youtube.com/watch?") + or str.startswith("https://www.youtube-nocookie.com/embed/") ############# embed + or str.startswith("" "" "" "https://www.youtube.com/embed/") + or str.startswith("" "" "" "" "https://m.youtube.com/embed/") + or str.startswith("https://www.youtube-nocookie.com/embed/videoseries?") # embed playlist + or str.startswith("" "" "" "https://www.youtube.com/embed/videoseries?") + or str.startswith("" "" "" "" "https://m.youtube.com/embed/videoseries?") + ): + r += [str[: str.find(" ")]] # make array of all matching "words" (urls) + i = str.find(" ") + 1 + return ( + match_urls(self, str[i:].strip(), r=r) if i != 0 else r + ) # recurse down each word, see if anything matches + + # makes for a little better syntax than a bunch of str.startswith calls + def matchstart(self, str, *arr): + for i in arr: + if str.startswith(i): + return True + return False + + def is_clip(self, str): + return self.matchstart(str, "https://youtube.com/clip/", "https://www.youtube.com/clip/") + + # boil down to video id + playlist id + def normalize_url(self, url): + dbgprint("normalize", url) + raw_url, videoId, listId = url, "", "" + # youtu.be + if self.matchstart(url, "https://youtu.be/"): + videoId = url.split("/")[3].split("?")[0] + dbgprint("youtu.be") + elif self.matchstart(url, "https://www.youtube.com/shorts/", "https://m.youtube.com/shorts/"): + videoId = url.split("?")[0].split("/")[-1] + dbgprint("/shorts", videoId) + # embed + elif self.matchstart( + url, + "https://m.youtube.com/embed/", + "https://www.youtube.com/embed/", + "https://www.youtube-nocookie.com/embed/", + ): + try: + listId = parse_qs(urlparse(url).query)["list"][0] + except KeyError: + if not url.split("/")[4].startswith("videoseries"): + videoId = url.split("/")[4] + dbgprint("embed", videoId, listId) + elif "v=" in url: # handles yt music, normal url, etc + for i in url.split("?")[1].split("&"): + if i[0:2] == "v=": + videoId = i[2:] + elif i[0:5] == "list=": + listId = i[5:] + if "videoId" in locals(): + url = "https://www.youtube.com/watch?" + if "videoId" in locals(): + if videoId != "": + url += f"v={videoId}" + if "listId" in locals(): + if listId != "": + if not url.endswith("?"): + url += "&" + url += f"list={listId}" + print("clean url", url) + return url + + # very close to normalize_url, maybe could reorganize better? + def normalize_playlist(self, url): + url = urlparse(url) + qs = parse_qs(url.query) + try: + video_id = qs["v"][0] + except KeyError: + video_id = None + try: + playlist_id = qs["list"][0] + # ignore the programmatic "mix" / "radio" lists, actual playlists start with "PL" + if playlist_id.startswith("RD"): + playlist_id = None + except KeyError: + playlist_id = None + if (self.prefer_playlist and playlist_id) or (playlist_id and not video_id): + url = url.scheme + "://" + url.netloc + "/playlist?list=" + playlist_id + elif video_id: + url = url.scheme + "://" + url.netloc + url.path + "?v=" + video_id + else: + self.setstring("string", "{{i}_prefix_err} unable to detect video ID!{pal['rst']}") + return {"irc": irc_string, "ansi": ansi_string}, True + return url + + # set both irc_name and ansi_name, using the appropriate palette + def setstring(self, name, val, mylocals=locals()): + prefixes = ["irc", "ansi"] + for i in prefixes: + value = val.replace("{i}", i) + mylocals.update(locals()) # merge the local variables + exec( + f"global {i}_{name}; pal=self.{i}_pal; {i}_{name}=f{repr(value)}", + globals(), + mylocals, + ) + + def yt(self, url): + self.setstring( + "prefix", + "[{pal['grn']}YouTube{pal['rst']}]", + ) + self.setstring( + "prefix_err", + "[{pal['red']}YouTube{pal['rst']}] {pal['ylw']}ERROR:{pal['b_ylw']}", + ) + self.setstring( + "string", + "{{i}_prefix_err} got no data from server! {pal['wht']}(check your URL for typos!){pal['rst']}", + ) + url = url.rstrip("\x01") # I forget exactly why, might be due to /me ? + url = self.normalize_url(url) + url = self.normalize_playlist(url) + url = f"https://www.youtube.com/oembed?{urlencode([('url',url),('format','json')])}" + try: + # print(url, " and ", playlist_id) + status, data = urlget(url) + if status != 200: + self.setstring("string", "{{i}_prefix_err} {status}{pal['rst']}", locals()) + return {"irc": irc_string, "ansi": ansi_string}, True + data = json_loads(data) + title, channelName = data["title"], data["author_name"] + except URLgetException as e: + self.setstring("string", "{{i}_prefix_err} {e}{pal['rst']}", locals()) + if __import__("sys").stdout.isatty(): + print(ansi_string) + return {"irc": irc_string, "ansi": ansi_string}, True + self.setstring( + "string", + "{{i}_prefix} {pal['bld']}{title}{pal['rst']} uploaded by {pal['itl']}{channelName}{pal['rst']}", + mylocals=locals(), + ) + if __import__("sys").stdout.isatty(): + print("ansi", ansi_string) + print("irc", irc_string) + return {"irc": irc_string, "ansi": ansi_string}, False + + +if __name__ == "__main__": + import sys + + # if url is a video that's part of a playlist, return playlist (True) or video (False, default)? + # YouTube.prefer_playlist=False + + # YouTube.yt(YouTube, sys.argv[1]) + # YouTube().yt(sys.argv[1]) + YT = YouTube() + print(YT.match_urls(sys.argv[1])) + YT.yt(sys.argv[1]) diff --git a/youtube_oembed.py b/youtube_oembed.py new file mode 100755 index 0000000..4ed427c --- /dev/null +++ b/youtube_oembed.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +from urllib.parse import urlencode, urlparse, parse_qs +from json import loads as json_loads +from URLget import urlget, URLgetException + + +class YouTube: + def __init__(self): + try: + YouTube.prefer_playlist = YouTube.prefer_playlist + except AttributeError: + YouTube.prefer_playlist = False + + def mesg(self, msg, t=None): + self.util.mesg(msg, t) + + def match_urls(self, str): + str = str.replace("http://", "https://") + r = [ + i + for i in str.split() + if "https://youtu.be/" in i + or "https://www.youtube.com/watch?v=" in i + or "https://m.youtube.com/watch?v=" in i + or "https://youtube.com/watch?v=" in i + or "https://www.youtube.com/embed/" in i + or "https://www.youtube-nocookie.com/embed/" in i + or "https://music.youtube.com/watch?v=" in i + or "https://youtube.com/shorts/" in i + or "https://www.youtube.com/shorts/" in i + ] + r = list(dict.fromkeys(r)) + n = 0 + for i in r: + if not i.startswith("http"): + r.pop(n) + n += 1 + + return r + + def is_embed(self, *str): + if type(self) == type("a"): + str = self + else: + str = str[0] + return str.startswith("https://www.youtube.com/embed/") or str.startswith( + "https://www.youtube-nocookie.com/embed/" + ) + + def is_ytmusic(self, *str): + if type(self) == type("a"): + str = self + else: + str = str[0] + return str.startswith("https://music.youtube.com/watch?v=") + + def is_ytshorts(self, *str): + if type(self) == type("a"): + str = self + else: + str = str[0] + return str.startswith("https://youtube.com/shorts/") or str.startswith("https://www.youtube.com/shorts/") + + def is_clip(self, *str): + if type(self) == type("a"): + str = self + else: + str = str[0] + return str.startswith("https://youtube.com/clip/") or str.startswith("https://www.youtube.com/clip/") + + def is_shorturl(self, *str): + if type(self) == type("a"): + str = self + else: + str = str[0] + return str.startswith("https://youtu.be/") + + def yt(self, url): + irc_string = ( + "[\x0304YouTube\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + ) + ansi_string = "[\x1b[31mYouTube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" + # self.util.mesg("dbg hello") + url = url.rstrip("\x01") + if self.is_embed(url): + videoId = url.split("/")[4] + url = f"https://www.youtube.com/watch?v={videoId}" + elif self.is_ytmusic(url): + for i in url.split("?")[1].split("&"): + if i[0:2] == "v=": + videoId = i[2:] + url = f"https://www.youtube.com/watch?v={videoId}" + elif self.is_ytshorts(url): + videoId = url.split("?")[0].split("/")[-1] + url = f"https://www.youtube.com/watch?v={videoId}" + elif self.is_shorturl(url): + videoId = url.split("/")[3].split("?")[0] + url = f"https://www.youtube.com/watch?v={videoId}" + url = urlparse(url) + qs = parse_qs(url.query) + try: + video_id = qs["v"][0] + except KeyError: + video_id = None + try: + playlist_id = qs["list"][0] + # ignore the "random mix" and "radio" lists + if playlist_id.startswith("RD"): + playlist_id = None + except KeyError: + playlist_id = None + if (self.prefer_playlist and playlist_id) or (playlist_id and not video_id): + url = url.scheme + "://" + url.netloc + "/playlist?list=" + playlist_id + else: + url = url.scheme + "://" + url.netloc + url.path + "?v=" + video_id + url = f"https://www.youtube.com/oembed?{urlencode([('url',url),('format','json')])}" + try: + # print(url, " and ", playlist_id) + status, data = urlget(url) + if status != 200: + irc_string = f"[\x0304YouTube\x03] \x0307ERROR:\x0308 {status} \x0315\x03" + ansi_string = f"[\x1b[31mYouTube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {status} \x1b[37;2m\x1b[0m" + data = json_loads(data) + title = data["title"] + channelName = data["author_name"] + except URLgetException as e: + irc_string = f"[\x0304YouTube\x03] \x0307ERROR:\x0308 {e} \x0315\x03" + ansi_string = f"[\x1b[31mYouTube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m" + if __import__("sys").stdout.isatty(): + print(ansi_string) + return irc_string, True + irc_string = f"[\x0303YouTube\x03] \x02{title}\x02 uploaded by \x1d{channelName}\x1d" + ansi_string = f"[\x1b[32mYouTube\x1b[0m] \x1b[1m{title}\x1b[0m uploaded by \x1b[03m{channelName}\x1b[0m" + if __import__("sys").stdout.isatty(): + print(ansi_string) + return irc_string, False + + +if __name__ == "__main__": + import sys + + # if url is a video that's part of a playlist, + # return playlist (True) or video (False, default)? + # YouTube.prefer_playlist=False + YouTube().yt(sys.argv[1]) + # YouTube.yt(YouTube, sys.argv[1]) diff --git a/youtube.py b/youtube_oembed_old.py index 1e60546..9719757 100755 --- a/youtube.py +++ b/youtube_oembed_old.py @@ -16,6 +16,7 @@ class YouTube: self.util.mesg(msg, t) def match_urls(self, str): + str = str.replace("http://", "https://") r = [ i for i in str.split() @@ -59,21 +60,19 @@ class YouTube: str = self else: str = str[0] - return str.startswith("https://youtube.com/shorts/") or str.startswith( - "https://www.youtube.com/shorts/" - ) + return str.startswith("https://youtube.com/shorts/") or str.startswith("https://www.youtube.com/shorts/") def is_clip(self, *str): if type(self) == type("a"): str = self else: str = str[0] - return str.startswith("https://youtube.com/clip/") or str.startswith( - "https://www.youtube.com/clip/" - ) + return str.startswith("https://youtube.com/clip/") or str.startswith("https://www.youtube.com/clip/") def yt(self, url): - irc_string = "[\x0304Youtube\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + irc_string = ( + "[\x0304Youtube\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + ) ansi_string = "[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" # self.util.mesg("dbg hello") url = url.rstrip("\x01") @@ -96,7 +95,7 @@ class YouTube: video_id = None try: playlist_id = qs["list"][0] - #ignore the "random mix" and "radio" lists + # ignore the "random mix" and "radio" lists if playlist_id.startswith("RD"): playlist_id = None except KeyError: @@ -117,9 +116,7 @@ class YouTube: ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m" print(ansi_string) return irc_string, True - irc_string = ( - f"[\x0303Youtube\x03] \x02{title}\x02 uploaded by \x1d{channelName}\x1d" - ) + irc_string = f"[\x0303Youtube\x03] \x02{title}\x02 uploaded by \x1d{channelName}\x1d" ansi_string = f"[\x1b[32mYoutube\x1b[0m] \x1b[1m{title}\x1b[0m uploaded by \x1b[03m{channelName}\x1b[0m" print(ansi_string) return irc_string, False diff --git a/youtube.py.old b/youtube_scrape.py index b09b1d1..951af3b 100755 --- a/youtube.py.old +++ b/youtube_scrape.py @@ -1,15 +1,27 @@ #!/usr/bin/env python3 from html.parser import HTMLParser -from urllib.request import urlopen -from urllib.error import HTTPError +from URLget import urlget, URLgetException + +# from URLget import URLgetException +# urlget=URLget().urlget +# print(urlget("http://ip.envs.net")) +# print(dir(URLget)) + class YouTube: + # crude import, lol + # URLget = URLget().URLget + # def __init__(self): + # self.URLget = URLget().URLget + # print(URLget,URLget.URLget) + video_type = "" def mesg(self, msg, t=None): self.util.mesg(msg, t) def match_urls(self, str): + str = str.replace("http://", "https://") r = [ i for i in str.split() @@ -34,40 +46,42 @@ class YouTube: return r - def is_embed(str): + def is_embed(self, str): return str.startswith("https://www.youtube.com/embed/") or str.startswith( "https://www.youtube-nocookie.com/embed/" ) - def is_ytmusic(str): + def is_ytmusic(self, str): return str.startswith("https://music.youtube.com/watch?v=") - def is_ytshorts(str): - return str.startswith("https://youtube.com/shorts/") or str.startswith( - "https://www.youtube.com/shorts/" - ) + def is_ytshorts(self, str): + return str.startswith("https://youtube.com/shorts/") or str.startswith("https://www.youtube.com/shorts/") - def is_clip(str): - return str.startswith("https://youtube.com/clip/") or str.startswith( - "https://www.youtube.com/clip/" - ) + def is_clip(self, str): + return str.startswith("https://youtube.com/clip/") or str.startswith("https://www.youtube.com/clip/") class parseprop(HTMLParser): def __init__(self): - #print("yt parse init") + # print("yt parse init") HTMLParser.__init__(self) self.itemprops_list = ["name", "duration", "uploadDate", "interactionCount"] self.h = {} if YouTube.video_type == "clip": self.itemprops_list += ["description"] print("it is a clip!") + self.title = False + + def handle_data(self, data): + if self.title != False: + # print("title",data) + self.h.update({"html_title": data}) + self.title = False def handle_starttag(self, tag, attrs): + if tag == "title": + self.title = True if (tag != "meta" and tag != "link") or ( - ( - [i for i in attrs if "itemprop" in i] == [] - and ("name", "title") not in attrs - ) + ([i for i in attrs if "itemprop" in i] == [] and ("name", "title") not in attrs) or (tag == "meta" and ("itemprop", "name") in attrs) ): return @@ -109,7 +123,9 @@ class YouTube: return f"{m}m {s}s" def yt(self, url): - irc_string = "[\x0304Youtube\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + irc_string = ( + "[\x0304Youtube\x03] \x0307ERROR:\x0308 got no data from server! \x0315(check your URL for typos!)\x03" + ) ansi_string = "[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m got no data from server! \x1b[37;2m(check your URL for typos!)\x1b[0m" # self.util.mesg("dbg hello") url = url.rstrip("\x01") @@ -119,11 +135,7 @@ class YouTube: else ( "shorts" if self.is_ytshorts(url) - else ( - "music" - if self.is_ytmusic(url) - else "embed" if self.is_embed(url) else "video" - ) + else ("music" if self.is_ytmusic(url) else "embed" if self.is_embed(url) else "video") ) ) video_type = self.video_type @@ -139,30 +151,28 @@ class YouTube: videoId = url.split("?")[0].split("/")[-1] url = f"https://www.youtube.com/watch?v={videoId}" p = self.parseprop() - # use premature optimization? it should be SLIGHTLY faster, but can sometimes fail data = b"" - if self.premature_optimization: - url_h = urlopen(url) - # <body> appears on approximately line 21 or 22, so we read 24 lines to be safe (23-25 should be license comment) - # I tried to read byte amounts but it's hard to make sure no invalid utf8 bytes happen due to partial reads - for i in range(24): - data += url_h.readline() - url_h.close() data = data.decode() # bytes to utf-8 - if ( - data.find('meta itemprop="duration"') == -1 - or data.find('meta itemprop="name"') == -1 - ): # acts as both fallback for optimization, and in case optimization's turned off - # just read all of the html - try: data = urlopen(url).read().decode() - except HTTPError as e: - irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {e} \x0315\x03" - ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m" + if data.find('meta itemprop="duration"') == -1 or data.find('meta itemprop="name"') == -1: + try: + status, data = urlget(url) + if status != 200: + irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {status} \x0315\x03" + ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {status} \x1b[37;2m\x1b[0m" + except URLgetException as e: + irc_string = f"[\x0304Youtube\x03] \x0307ERROR:\x0308 {e} \x0315\x03" + ansi_string = f"[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m {e} \x1b[37;2m\x1b[0m" # print(f"\x1b[31m my data is: {data}\x1b[0m") + print(data) p.feed(data) if p.h == {}: print(ansi_string) return irc_string, True + elif p.h == {"html_title": "YouTube"}: + irc_string = "[\x0304Youtube\x03] \x0307ERROR:\x0308 flagged as bot \x0315\x03" + ansi_string = "[\x1b[31mYoutube\x1b[0m] \x1b[33;2mERROR:\x1b[33;1m flagged as bot \x1b[37;2m\x1b[0m" + print(ansi_string) + return irc_string, True y = p.h print(y) y.update(duration=self.fmt_dur(y["duration"])) @@ -176,4 +186,5 @@ if __name__ == "__main__": import sys YouTube.premature_optimization = False - YouTube.yt(YouTube, sys.argv[1]) + # YouTube.yt(YouTube, sys.argv[1]) + YouTube().yt(sys.argv[1]) |