import html
import re
import urllib
from urllib import request
from FaustBot.Communication.Connection import Connection
from FaustBot.Modules.PrivMsgObserverPrototype import PrivMsgObserverPrototype
class TitleObserver(PrivMsgObserverPrototype):
    """Reply to a message containing an HTTP(S) link with that page's title.

    The observer has no command and no help text; it reacts to every
    message passed to :meth:`update_on_priv_msg`.
    """

    # Upper bound on the number of title characters sent back.
    _MAX_TITLE_LENGTH = 350

    _DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

    # First whitespace-delimited http(s) URL inside a message.
    _URL_RE = re.compile(r"(?P<url>https?://\S+)")

    # URL shapes that are refused. All of these are applied with ``match`` so
    # that only the authority part at the start of the URL is inspected, not
    # a second URL embedded in the path or query string.
    _IPV6_HOST_RE = re.compile(r"https?://\[[^/]*")
    _PORT_RE = re.compile(r"https?://[^/:?#]*:[^/:]*")
    _IPV4_HOST_RE = re.compile(r"https?://[0-9]+\.[0-9]+\.[0-9]+\.[^/]*")

    # YouTube URL shapes that need rewriting or a dedicated title pattern.
    _YT_MUSIC_RE = re.compile(r"https?://music\.youtube\.com/")
    _YT_SHORT_LINK_RE = re.compile(r"https?://youtu\.be/")
    _YT_SHORTS_RE = re.compile(r"https?://[^/]*youtube\.com/shorts/")
    _YT_ANY_RE = re.compile(r"https?://[^/]*youtube\.com/")

    # Title extraction patterns; group 1 is the title text.
    _YT_SHORTS_TITLE_RE = re.compile(
        r'''"reelPlayerHeaderRenderer":{"reelTitleText":{"runs":\[{"text":"([^"]*)"'''
    )
    _YT_VIDEO_TITLE_RE = re.compile(
        r'''"results":{"contents":\[{"videoPrimaryInfoRenderer":{"title":{"runs":\[{"text":"([^"]*)"'''
    )
    _HTML_TITLE_RE = re.compile(
        r"<title[^>]*>(.+?)</title>", re.IGNORECASE | re.DOTALL
    )

    @staticmethod
    def cmd():
        """Return ``None``: this observer is not bound to a command."""
        return None

    @staticmethod
    def help():
        """Return ``None``: this observer provides no help text."""
        return None

    def update_on_priv_msg(self, data, connection: Connection):
        """Look for a URL in the message and send its page title back.

        :param data: message mapping; ``data["messageCaseSensitive"]`` is
            searched for the first http(s) URL.
        :param connection: connection whose ``send_back`` receives the title
            (truncated to ``_MAX_TITLE_LENGTH`` characters) and ``data``.

        Any failure while fetching or parsing is printed and otherwise
        ignored, so a bad link never propagates an exception to the caller.
        """
        found = self._URL_RE.search(data["messageCaseSensitive"])
        if found is None:
            return

        url = found.group("url")
        print(url)
        try:
            title = self.getTitle(url)
            print(title)
            connection.send_back(title[: self._MAX_TITLE_LENGTH], data)
        except Exception as exc:
            # Deliberately best-effort: report the problem and carry on.
            print(exc)

    def getTitle(self, url):
        """Fetch ``url`` and return the title of the page behind it.

        :param url: absolute http(s) URL.
        :returns: the unescaped title with all whitespace runs collapsed to
            single spaces (so it never contains CR or LF).
        :raises Exception: if the URL uses a bracketed IPv6 host, contains a
            port (or any other ``:`` in the authority part), or has a
            numeric dotted host; if no title can be found in the response;
            or if the title is empty. Errors raised by ``urllib`` while
            fetching propagate unchanged.
        """
        headers = {"User-Agent": self._DEFAULT_USER_AGENT}

        if self._IPV6_HOST_RE.match(url):
            raise Exception("Refusing to parse bare IPv6 Addresses")
        if self._PORT_RE.match(url):
            raise Exception("Refusing to parse URLs with Ports")
        if self._IPV4_HOST_RE.match(url):
            raise Exception("Refusing to parse bare IPv4 Addresses")

        # Normalise alternative YouTube hosts to www.youtube.com so that the
        # YouTube-specific title patterns below apply to them as well.
        if self._YT_MUSIC_RE.match(url):
            url = url.replace("music.youtube.com/", "www.youtube.com/", 1)
        if self._YT_SHORT_LINK_RE.match(url):
            url = url.replace("youtu.be/", "www.youtube.com/watch?v=", 1)

        if self._YT_SHORTS_RE.match(url):
            title_re = self._YT_SHORTS_TITLE_RE
            # NOTE(review): shorts are requested with a curl User-Agent;
            # presumably that response contains the reel title JSON - confirm.
            headers["User-Agent"] = "curl/7.81.0"
        elif self._YT_ANY_RE.match(url):
            title_re = self._YT_VIDEO_TITLE_RE
        else:
            title_re = self._HTML_TITLE_RE

        req = urllib.request.Request(url, None, headers)
        # Keep the urlopen scope as short as possible (connection leaks)
        with urllib.request.urlopen(req, timeout=10) as response:
            encoding = response.headers.get_content_charset()
            content_raw = response.read()

        # The rehakids.de special case can be removed once a different
        # channel is used.
        if "rehakids.de" in url:
            encoding = "windows-1252"
        if not encoding:
            encoding = "utf-8"

        try:
            content = content_raw.decode(encoding, errors="replace")
        except LookupError:
            # The server announced a charset Python does not know.
            content = content_raw.decode("utf-8", errors="replace")

        title_match = title_re.search(content)
        if not title_match:
            raise Exception(f"Could not Parse Title for {url}")

        # Unescape first so that whitespace produced by character references
        # (e.g. "&#10;") is normalised away together with literal whitespace.
        title = html.unescape(title_match.group(1))
        title = " ".join(title.split())
        if title == "":
            raise Exception(f"Empty Title for {url}")
        return title