123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199 |
- import re
- from typing import Union
- from django.http import Http404
- from django.urls import resolve
- from .htmlparser import ElementNode, RootNode, TextNode
- MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")
- URL_RE = re.compile(
- r"(https?://)?"
- r"(www\.)?"
- r"(\w+((-|_)\w+)?\.)?"
- r"\w+((_|-|\w)+)?(\.[a-z][a-z]+)"
- r"(:[1-9][0-9]+)?"
- r"([^\s<>\[\]\(\);:]+)?"
- )
- def linkify_texts(node: Union[RootNode, ElementNode]):
- # Skip link replacement in some nodes
- if node.tag in ("pre", "code", "a"):
- return
- new_children = []
- for child in node.children:
- if isinstance(child, TextNode):
- if URL_RE.search(child.text):
- new_children += replace_links_in_text(child.text)
- else:
- new_children.append(child)
- else:
- new_children.append(child)
- linkify_texts(child)
- node.children = new_children
- def replace_links_in_text(text: str) -> list:
- nodes = []
- while True:
- match = URL_RE.search(text)
- if not match:
- if text:
- nodes.append(TextNode(text=text))
- return nodes
- start, end = match.span()
- url = text[start:end]
- # Append text between 0 and start to nodes
- if start > 0:
- nodes.append(TextNode(text=text[:start]))
- nodes.append(
- ElementNode(
- tag="a",
- attrs={"href": url},
- children=[
- TextNode(text=strip_link_protocol(url)),
- ],
- )
- )
- text = text[end:]
- def clean_links(
- request,
- result,
- node: Union[RootNode, ElementNode, TextNode],
- force_shva=False,
- ):
- if isinstance(node, TextNode):
- return
- for child in node.children:
- if not isinstance(child, ElementNode):
- continue
- if child.tag == "a":
- clean_link_node(request, result, child, force_shva)
- clean_links(request, result, child, force_shva)
- elif child.tag == "img":
- clean_image_node(request, result, child, force_shva)
- else:
- clean_links(request, result, child, force_shva)
- def clean_link_node(
- request,
- result: dict,
- node: ElementNode,
- force_shva: bool,
- ):
- host = request.get_host()
- href = node.attrs.get("href") or "/"
- if is_internal_link(href, host):
- href = clean_internal_link(href, host)
- result["internal_links"].append(href)
- href = clean_attachment_link(href, force_shva)
- else:
- result["outgoing_links"].append(strip_link_protocol(href))
- href = assert_link_prefix(href)
- node.attrs["rel"] = "external nofollow noopener"
- node.attrs["target"] = "_blank"
- node.attrs["href"] = href
- if len(node.children) == 0:
- node.children.append(strip_link_protocol(href))
- elif len(node.children) == 1 and isinstance(node.children[0], TextNode):
- text = node.children[0].text
- if URL_RE.match(text):
- node.children[0].text = strip_link_protocol(text)
- def clean_image_node(
- request,
- result: dict,
- node: ElementNode,
- force_shva: bool,
- ):
- host = request.get_host()
- src = node.attrs.get("src") or "/"
- node.attrs["alt"] = strip_link_protocol(node.attrs["alt"])
- if is_internal_link(src, host):
- src = clean_internal_link(src, host)
- result["images"].append(src)
- src = clean_attachment_link(src, force_shva)
- else:
- result["images"].append(strip_link_protocol(src))
- src = assert_link_prefix(src)
- node.attrs["src"] = src
- def is_internal_link(link, host):
- if link.startswith("/") and not link.startswith("//"):
- return True
- link = strip_link_protocol(link).lstrip("www.").lower()
- return link.lower().startswith(host.lstrip("www."))
- def strip_link_protocol(link):
- if link.lower().startswith("https:"):
- link = link[6:]
- if link.lower().startswith("http:"):
- link = link[5:]
- if link.startswith("//"):
- link = link[2:]
- return link
- def assert_link_prefix(link):
- if link.lower().startswith("https:"):
- return link
- if link.lower().startswith("http:"):
- return link
- if link.startswith("//"):
- return "http:%s" % link
- return "http://%s" % link
- def clean_internal_link(link, host):
- link = strip_link_protocol(link)
- if link.lower().startswith("www."):
- link = link[4:]
- if host.lower().startswith("www."):
- host = host[4:]
- if link.lower().startswith(host):
- link = link[len(host) :]
- return link or "/"
- def clean_attachment_link(link, force_shva=False):
- try:
- resolution = resolve(link)
- if not resolution.namespaces:
- return link
- url_name = ":".join(resolution.namespaces + [resolution.url_name])
- except (Http404, ValueError):
- return link
- if url_name in MISAGO_ATTACHMENT_VIEWS:
- if force_shva:
- link = "%s?shva=1" % link
- elif link.endswith("?shva=1"):
- link = link[:-7]
- return link
|