Merge pull request #12 from ctx77/main

Adding youtube-special-case handling to extract title
This commit is contained in:
BaerbelBox
2024-08-01 07:49:36 +02:00
committed by GitHub

View File

@@ -18,12 +18,14 @@ class TitleObserver(PrivMsgObserverPrototype):
def update_on_priv_msg(self, data, connection: Connection): def update_on_priv_msg(self, data, connection: Connection):
regex = "(?P<url>https?://[^\s]+)" regex = "(?P<url>https?://[^\s]+)"
url = re.search(regex, data['messageCaseSensitive']) url = re.search(regex, data["messageCaseSensitive"])
if url is not None: if url is not None:
url = url.group() url = url.group()
print(url) print(url)
try: try:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'} headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}
url = url url = url
req = urllib.request.Request(url, None, headers) req = urllib.request.Request(url, None, headers)
@@ -35,20 +37,35 @@ class TitleObserver(PrivMsgObserverPrototype):
except Exception as exc: except Exception as exc:
print(exc) print(exc)
pass pass
def getTitle(self, resource):
    """Extract a displayable page title from an opened HTTP response.

    ``resource`` must provide ``headers.get_content_charset()``,
    ``geturl()`` and ``read()`` (the object returned by
    ``urllib.request.urlopen`` does).

    For YouTube URLs the title is taken from the embedded
    ``videoPrimaryInfoRenderer`` JSON payload; for every other URL it is
    taken from the first ``<title>`` element.

    Returns the title with HTML entities decoded once and all whitespace
    runs collapsed to single spaces, ``"Could not Parse Title"`` when no
    title can be located, or ``"Empty Title"`` when the located title is
    blank.
    """
    import json  # local import: only needed to decode the YouTube JSON string

    url = resource.geturl()
    encoding = resource.headers.get_content_charset()
    # This special case can be removed once a different channel is used.
    if "rehakids.de" in url:
        encoding = "windows-1252"
    if not encoding:
        encoding = "utf-8"

    raw = resource.read()
    try:
        content = raw.decode(encoding, errors="replace")
    except LookupError:
        # The server announced a charset Python does not know; fall back
        # instead of losing the title altogether.
        content = raw.decode("utf-8", errors="replace")

    # "https?" accepts both schemes (the former "http[s]+" rejected plain
    # http); the optional sub-domain group keeps look-alike hosts such as
    # "notyoutube.com" on the generic <title> path.
    is_youtube = (
        re.search(r"https?://(?:[^/]*\.)?youtube\.com/", url) is not None
    )
    if is_youtube:
        # The capture group accepts backslash escapes so that an escaped
        # quote inside the JSON string does not truncate the title.
        title_re = re.compile(
            r'"results":\{"contents":\[\{"videoPrimaryInfoRenderer":'
            r'\{"title":\{"runs":\[\{"text":"((?:[^"\\]|\\.)*)"'
        )
    else:
        # DOTALL: titles may span several lines; IGNORECASE: <TITLE> is
        # valid HTML; the optional group tolerates attributes on the tag.
        title_re = re.compile(
            r"<title(?:\s[^>]*)?>(.+?)</title>", re.IGNORECASE | re.DOTALL
        )

    title_match = title_re.search(content)
    if title_match is None:
        return "Could not Parse Title"
    title = title_match.group(1)

    if is_youtube:
        try:
            # Decode JSON escapes (\" \\ \uXXXX ...) of the captured string.
            title = json.loads('"' + title + '"', strict=False)
        except ValueError:
            # Not a well-formed JSON string body; keep the raw capture.
            pass

    # html.unescape already handles &lt; / &gt; / &amp;. Replacing them
    # again afterwards would decode text such as "&amp;amp;" twice.
    title = html.unescape(title)
    # Collapse newlines, carriage returns and other whitespace runs so the
    # title is always a single line.
    title = " ".join(title.split())
    if not title:
        title = "Empty Title"
    return title