diff --git a/src/searchengine/nova3/helpers.py b/src/searchengine/nova3/helpers.py
index ef8376a28..abf201439 100644
--- a/src/searchengine/nova3/helpers.py
+++ b/src/searchengine/nova3/helpers.py
@@ -1,4 +1,4 @@
-#VERSION: 1.49
+#VERSION: 1.50
 
 # Author:
 #  Christophe DUMEZ (chris@qbittorrent.org)
@@ -29,12 +29,13 @@
 
 import datetime
 import gzip
-import html.entities
+import html
 import io
 import os
 import re
 import socket
 import socks
+import ssl
 import sys
 import tempfile
 import urllib.error
@@ -72,29 +73,16 @@ if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
     socket.socket = socks.socksocket  # type: ignore[misc]
 
 
-def htmlentitydecode(s: str) -> str:
-    # First convert alpha entities (such as &eacute;)
-    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
-    def entity2char(m: re.Match[str]) -> str:
-        entity = m.group(1)
-        if entity in html.entities.name2codepoint:
-            return chr(html.entities.name2codepoint[entity])
-        return " "  # Unknown entity: We replace with a space.
-    t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)
-
-    # Then convert numerical entities (such as &#38;)
-    t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)
-
-    # Then convert hexa entities (such as &#x26;)
-    return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
+# This is only provided for backward compatibility, new code should not use it
+htmlentitydecode = html.unescape
 
 
 
-def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str:
+def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
     """ Return the content of the url page as a string """
     request = urllib.request.Request(url, request_data, {**headers, **custom_headers})
     try:
-        response = urllib.request.urlopen(request)
+        response = urllib.request.urlopen(request, context=ssl_context)
     except urllib.error.URLError as errno:
         print(f"Connection error: {errno.reason}", file=sys.stderr)
         return ""
@@ -117,14 +105,14 @@ def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data:
     return dataStr
 
 
-def download_file(url: str, referer: Optional[str] = None) -> str:
+def download_file(url: str, referer: Optional[str] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
     """ Download file at url and write it to a file, return the path to the file and the url """
     # Download url
     request = urllib.request.Request(url, headers=headers)
     if referer is not None:
         request.add_header('referer', referer)
-    response = urllib.request.urlopen(request)
+    response = urllib.request.urlopen(request, context=ssl_context)
     data = response.read()
 
     # Check if it is gzipped