The stubs have some issues so this has some generous cast and ignores in it, but it is better than not having stubs. Note that confusing that Element is a function which creates _Element instances (and similarly for Comment).tags/v1.86.0rc1
@@ -0,0 +1 @@ | |||
Improve type hints. |
@@ -60,9 +60,6 @@ ignore_missing_imports = True | |||
[mypy-ijson.*] | |||
ignore_missing_imports = True | |||
[mypy-lxml] | |||
ignore_missing_imports = True | |||
# https://github.com/msgpack/msgpack-python/issues/448 | |||
[mypy-msgpack] | |||
ignore_missing_imports = True | |||
@@ -1,4 +1,4 @@ | |||
# This file is automatically @generated by Poetry and should not be changed by hand. | |||
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. | |||
[[package]] | |||
name = "alabaster" | |||
@@ -1215,6 +1215,21 @@ html5 = ["html5lib"] | |||
htmlsoup = ["BeautifulSoup4"] | |||
source = ["Cython (>=0.29.7)"] | |||
[[package]] | |||
name = "lxml-stubs" | |||
version = "0.4.0" | |||
description = "Type annotations for the lxml package" | |||
category = "dev" | |||
optional = false | |||
python-versions = "*" | |||
files = [ | |||
{file = "lxml-stubs-0.4.0.tar.gz", hash = "sha256:184877b42127256abc2b932ba8bd0ab5ea80bd0b0fee618d16daa40e0b71abee"}, | |||
{file = "lxml_stubs-0.4.0-py3-none-any.whl", hash = "sha256:3b381e9e82397c64ea3cc4d6f79d1255d015f7b114806d4826218805c10ec003"}, | |||
] | |||
[package.extras] | |||
test = ["coverage[toml] (==5.2)", "pytest (>=6.0.0)", "pytest-mypy-plugins (==1.9.3)"] | |||
[[package]] | |||
name = "markdown-it-py" | |||
version = "2.2.0" | |||
@@ -3409,22 +3424,22 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] | |||
test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] | |||
[extras] | |||
all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"] | |||
all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"] | |||
cache-memory = ["Pympler"] | |||
jwt = ["authlib"] | |||
matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] | |||
oidc = ["authlib"] | |||
opentracing = ["jaeger-client", "opentracing"] | |||
postgres = ["psycopg2", "psycopg2cffi", "psycopg2cffi-compat"] | |||
redis = ["txredisapi", "hiredis"] | |||
redis = ["hiredis", "txredisapi"] | |||
saml2 = ["pysaml2"] | |||
sentry = ["sentry-sdk"] | |||
systemd = ["systemd-python"] | |||
test = ["parameterized", "idna"] | |||
test = ["idna", "parameterized"] | |||
url-preview = ["lxml"] | |||
user-search = ["pyicu"] | |||
[metadata] | |||
lock-version = "2.0" | |||
python-versions = "^3.7.1" | |||
content-hash = "ef3a16dd66177f7141239e1a2d3e07cc14c08f1e4e0c5127184d022bc062da52" | |||
content-hash = "7ad11e62a675e09444cf33ca2de3216fc4efc5874a2575e54d95d577a52439d3" |
@@ -314,6 +314,7 @@ black = ">=22.3.0" | |||
ruff = "0.0.265" | |||
# Typechecking | |||
lxml-stubs = ">=0.4.0" | |||
mypy = "*" | |||
mypy-zope = "*" | |||
types-bleach = ">=4.1.0" | |||
@@ -14,7 +14,7 @@ | |||
import html | |||
import logging | |||
import urllib.parse | |||
from typing import TYPE_CHECKING, List, Optional | |||
from typing import TYPE_CHECKING, List, Optional, cast | |||
import attr | |||
@@ -98,7 +98,7 @@ class OEmbedProvider: | |||
# No match. | |||
return None | |||
def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: | |||
def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]: | |||
""" | |||
Search an HTML document for oEmbed autodiscovery information. | |||
@@ -109,18 +109,22 @@ class OEmbedProvider: | |||
The URL to use for oEmbed information, or None if no URL was found. | |||
""" | |||
# Search for link elements with the proper rel and type attributes. | |||
for tag in tree.xpath( | |||
"//link[@rel='alternate'][@type='application/json+oembed']" | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
for tag in cast( | |||
List["etree._Element"], | |||
tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"), | |||
): | |||
if "href" in tag.attrib: | |||
return tag.attrib["href"] | |||
return cast(str, tag.attrib["href"]) | |||
# Some providers (e.g. Flickr) use alternative instead of alternate. | |||
for tag in tree.xpath( | |||
"//link[@rel='alternative'][@type='application/json+oembed']" | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
for tag in cast( | |||
List["etree._Element"], | |||
tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"), | |||
): | |||
if "href" in tag.attrib: | |||
return tag.attrib["href"] | |||
return cast(str, tag.attrib["href"]) | |||
return None | |||
@@ -212,11 +216,12 @@ class OEmbedProvider: | |||
return OEmbedResult(open_graph_response, author_name, cache_age) | |||
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: | |||
def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]: | |||
results = [] | |||
for tag in tree.xpath("//*/" + tag_name): | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)): | |||
if "src" in tag.attrib: | |||
results.append(tag.attrib["src"]) | |||
results.append(cast(str, tag.attrib["src"])) | |||
return results | |||
@@ -244,11 +249,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> | |||
parser = etree.HTMLParser(recover=True, encoding="utf-8") | |||
# Attempt to parse the body. If this fails, log and return no metadata. | |||
tree = etree.fromstring(html_body, parser) | |||
# TODO Develop of lxml-stubs has this correct. | |||
tree = etree.fromstring(html_body, parser) # type: ignore[arg-type] | |||
# The data was successfully parsed, but no tree was found. | |||
if tree is None: | |||
return | |||
return # type: ignore[unreachable] | |||
# Attempt to find interesting URLs (images, videos, embeds). | |||
if "og:image" not in open_graph_response: | |||
@@ -24,6 +24,7 @@ from typing import ( | |||
Optional, | |||
Set, | |||
Union, | |||
cast, | |||
) | |||
if TYPE_CHECKING: | |||
@@ -115,7 +116,7 @@ def _get_html_media_encodings( | |||
def decode_body( | |||
body: bytes, uri: str, content_type: Optional[str] = None | |||
) -> Optional["etree.Element"]: | |||
) -> Optional["etree._Element"]: | |||
""" | |||
This uses lxml to parse the HTML document. | |||
@@ -152,11 +153,12 @@ def decode_body( | |||
# Attempt to parse the body. Returns None if the body was successfully | |||
# parsed, but no tree was found. | |||
return etree.fromstring(body, parser) | |||
# TODO Develop of lxml-stubs has this correct. | |||
return etree.fromstring(body, parser) # type: ignore[arg-type] | |||
def _get_meta_tags( | |||
tree: "etree.Element", | |||
tree: "etree._Element", | |||
property: str, | |||
prefix: str, | |||
property_mapper: Optional[Callable[[str], Optional[str]]] = None, | |||
@@ -175,9 +177,15 @@ def _get_meta_tags( | |||
Returns: | |||
A map of tag name to value. | |||
""" | |||
# This actually returns Dict[str, str], but the caller sets this as a variable | |||
# which is Dict[str, Optional[str]]. | |||
results: Dict[str, Optional[str]] = {} | |||
for tag in tree.xpath( | |||
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
for tag in cast( | |||
List["etree._Element"], | |||
tree.xpath( | |||
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" | |||
), | |||
): | |||
# if we've got more than 50 tags, someone is taking the piss | |||
if len(results) >= 50: | |||
@@ -187,14 +195,15 @@ def _get_meta_tags( | |||
) | |||
return {} | |||
key = tag.attrib[property] | |||
key = cast(str, tag.attrib[property]) | |||
if property_mapper: | |||
key = property_mapper(key) | |||
new_key = property_mapper(key) | |||
# None is a special value used to ignore a value. | |||
if key is None: | |||
if new_key is None: | |||
continue | |||
key = new_key | |||
results[key] = tag.attrib["content"] | |||
results[key] = cast(str, tag.attrib["content"]) | |||
return results | |||
@@ -219,7 +228,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]: | |||
return "og" + key[7:] | |||
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]: | |||
""" | |||
Parse the HTML document into an Open Graph response. | |||
@@ -276,24 +285,36 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
if "og:title" not in og: | |||
# Attempt to find a title from the title tag, or the biggest header on the page. | |||
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
title = cast( | |||
List["etree._ElementUnicodeResult"], | |||
tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"), | |||
) | |||
if title: | |||
og["og:title"] = title[0].strip() | |||
else: | |||
og["og:title"] = None | |||
if "og:image" not in og: | |||
meta_image = tree.xpath( | |||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
meta_image = cast( | |||
List["etree._ElementUnicodeResult"], | |||
tree.xpath( | |||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" | |||
), | |||
) | |||
# If a meta image is found, use it. | |||
if meta_image: | |||
og["og:image"] = meta_image[0] | |||
else: | |||
# Try to find images which are larger than 10px by 10px. | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
# | |||
# TODO: consider inlined CSS styles as well as width & height attribs | |||
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") | |||
images = cast( | |||
List["etree._Element"], | |||
tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"), | |||
) | |||
images = sorted( | |||
images, | |||
key=lambda i: ( | |||
@@ -302,20 +323,29 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
) | |||
# If no images were found, try to find *any* images. | |||
if not images: | |||
images = tree.xpath("//img[@src][1]") | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
images = cast(List["etree._Element"], tree.xpath("//img[@src][1]")) | |||
if images: | |||
og["og:image"] = images[0].attrib["src"] | |||
og["og:image"] = cast(str, images[0].attrib["src"]) | |||
# Finally, fallback to the favicon if nothing else. | |||
else: | |||
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
favicons = cast( | |||
List["etree._ElementUnicodeResult"], | |||
tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"), | |||
) | |||
if favicons: | |||
og["og:image"] = favicons[0] | |||
if "og:description" not in og: | |||
# Check the first meta description tag for content. | |||
meta_description = tree.xpath( | |||
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" | |||
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. | |||
meta_description = cast( | |||
List["etree._ElementUnicodeResult"], | |||
tree.xpath( | |||
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" | |||
), | |||
) | |||
# If a meta description is found with content, use it. | |||
if meta_description: | |||
@@ -332,7 +362,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
return og | |||
def parse_html_description(tree: "etree.Element") -> Optional[str]: | |||
def parse_html_description(tree: "etree._Element") -> Optional[str]: | |||
""" | |||
Calculate a text description based on an HTML document. | |||
@@ -368,6 +398,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: | |||
"canvas", | |||
"img", | |||
"picture", | |||
# etree.Comment is a function which creates an etree._Comment element. | |||
# The "tag" attribute of an etree._Comment instance is confusingly the | |||
# etree.Comment function instead of a string. | |||
etree.Comment, | |||
} | |||
@@ -381,8 +414,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: | |||
def _iterate_over_text( | |||
tree: Optional["etree.Element"], | |||
tags_to_ignore: Set[Union[str, "etree.Comment"]], | |||
tree: Optional["etree._Element"], | |||
tags_to_ignore: Set[object], | |||
stack_limit: int = 1024, | |||
) -> Generator[str, None, None]: | |||
"""Iterate over the tree returning text nodes in a depth first fashion, | |||
@@ -402,7 +435,7 @@ def _iterate_over_text( | |||
# This is a stack whose items are elements to iterate over *or* strings | |||
# to be returned. | |||
elements: List[Union[str, "etree.Element"]] = [tree] | |||
elements: List[Union[str, "etree._Element"]] = [tree] | |||
while elements: | |||
el = elements.pop() | |||
@@ -24,7 +24,7 @@ from tests import unittest | |||
try: | |||
import lxml | |||
except ImportError: | |||
lxml = None | |||
lxml = None # type: ignore[assignment] | |||
class SummarizeTestCase(unittest.TestCase): | |||
@@ -160,6 +160,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | |||
@@ -176,6 +177,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | |||
@@ -195,6 +197,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual( | |||
@@ -217,6 +220,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | |||
@@ -231,6 +235,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) | |||
@@ -246,6 +251,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Title", "og:description": "Title"}) | |||
@@ -261,6 +267,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) | |||
@@ -281,6 +288,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"}) | |||
@@ -296,6 +304,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) | |||
@@ -324,6 +333,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
<head><title>Foo</title></head><body>Some text.</body></html> | |||
""".strip() | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | |||
@@ -338,6 +348,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
</html> | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | |||
@@ -353,6 +364,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
</html> | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) | |||
@@ -367,6 +379,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
</html> | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) | |||
@@ -380,6 +393,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
</html> | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual( | |||
og, | |||
@@ -401,6 +415,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
</html> | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual( | |||
og, | |||
@@ -419,6 +434,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase): | |||
with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a> | |||
""" | |||
tree = decode_body(html, "http://example.com/test.html") | |||
assert tree is not None | |||
og = parse_html_to_open_graph(tree) | |||
self.assertEqual( | |||
og, | |||
@@ -28,7 +28,7 @@ from tests.unittest import HomeserverTestCase | |||
try: | |||
import lxml | |||
except ImportError: | |||
lxml = None | |||
lxml = None # type: ignore[assignment] | |||
class OEmbedTests(HomeserverTestCase): | |||
@@ -24,7 +24,7 @@ from tests.unittest import override_config | |||
try: | |||
import lxml | |||
except ImportError: | |||
lxml = None | |||
lxml = None # type: ignore[assignment] | |||
class URLPreviewTests(unittest.HomeserverTestCase): | |||
@@ -40,7 +40,7 @@ from tests.test_utils import SMALL_PNG | |||
try: | |||
import lxml | |||
except ImportError: | |||
lxml = None | |||
lxml = None # type: ignore[assignment] | |||
class URLPreviewTests(unittest.HomeserverTestCase): | |||