links.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. import re
  2. from typing import Union
  3. from django.http import Http404
  4. from django.urls import resolve
  5. from .htmlparser import ElementNode, RootNode, TextNode
  6. MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")
  7. URL_RE = re.compile(
  8. r"(https?://)?"
  9. r"(www\.)?"
  10. r"(\w+((-|_)\w+)?\.)?"
  11. r"\w+((_|-|\w)+)?(\.[a-z][a-z]+)"
  12. r"(:[1-9][0-9]+)?"
  13. r"([^\s<>\[\]\(\);:]+)?"
  14. )
  15. def linkify_texts(node: Union[RootNode, ElementNode]):
  16. # Skip link replacement in some nodes
  17. if node.tag in ("pre", "code", "a"):
  18. return
  19. new_children = []
  20. for child in node.children:
  21. if isinstance(child, TextNode):
  22. if URL_RE.search(child.text):
  23. new_children += replace_links_in_text(child.text)
  24. else:
  25. new_children.append(child)
  26. else:
  27. new_children.append(child)
  28. linkify_texts(child)
  29. node.children = new_children
  30. def replace_links_in_text(text: str) -> list:
  31. nodes = []
  32. while True:
  33. match = URL_RE.search(text)
  34. if not match:
  35. if text:
  36. nodes.append(TextNode(text=text))
  37. return nodes
  38. start, end = match.span()
  39. url = text[start:end]
  40. # Append text between 0 and start to nodes
  41. if start > 0:
  42. nodes.append(TextNode(text=text[:start]))
  43. nodes.append(
  44. ElementNode(
  45. tag="a",
  46. attrs={"href": url},
  47. children=[
  48. TextNode(text=strip_link_protocol(url)),
  49. ],
  50. )
  51. )
  52. text = text[end:]
  53. def clean_links(
  54. request,
  55. result,
  56. node: Union[RootNode, ElementNode, TextNode],
  57. force_shva=False,
  58. ):
  59. if isinstance(node, TextNode):
  60. return
  61. for child in node.children:
  62. if not isinstance(child, ElementNode):
  63. continue
  64. if child.tag == "a":
  65. clean_link_node(request, result, child, force_shva)
  66. clean_links(request, result, child, force_shva)
  67. elif child.tag == "img":
  68. clean_image_node(request, result, child, force_shva)
  69. else:
  70. clean_links(request, result, child, force_shva)
  71. def clean_link_node(
  72. request,
  73. result: dict,
  74. node: ElementNode,
  75. force_shva: bool,
  76. ):
  77. host = request.get_host()
  78. href = node.attrs.get("href") or "/"
  79. if is_internal_link(href, host):
  80. href = clean_internal_link(href, host)
  81. result["internal_links"].append(href)
  82. href = clean_attachment_link(href, force_shva)
  83. else:
  84. result["outgoing_links"].append(strip_link_protocol(href))
  85. href = assert_link_prefix(href)
  86. node.attrs["rel"] = "external nofollow noopener"
  87. node.attrs["target"] = "_blank"
  88. node.attrs["href"] = href
  89. if len(node.children) == 0:
  90. node.children.append(strip_link_protocol(href))
  91. elif len(node.children) == 1 and isinstance(node.children[0], TextNode):
  92. text = node.children[0].text
  93. if URL_RE.match(text):
  94. node.children[0].text = strip_link_protocol(text)
  95. def clean_image_node(
  96. request,
  97. result: dict,
  98. node: ElementNode,
  99. force_shva: bool,
  100. ):
  101. host = request.get_host()
  102. src = node.attrs.get("src") or "/"
  103. node.attrs["alt"] = strip_link_protocol(node.attrs["alt"])
  104. if is_internal_link(src, host):
  105. src = clean_internal_link(src, host)
  106. result["images"].append(src)
  107. src = clean_attachment_link(src, force_shva)
  108. else:
  109. result["images"].append(strip_link_protocol(src))
  110. src = assert_link_prefix(src)
  111. node.attrs["src"] = src
  112. def is_internal_link(link, host):
  113. if link.startswith("/") and not link.startswith("//"):
  114. return True
  115. link = strip_link_protocol(link).lstrip("www.").lower()
  116. return link.lower().startswith(host.lstrip("www."))
  117. def strip_link_protocol(link):
  118. if link.lower().startswith("https:"):
  119. link = link[6:]
  120. if link.lower().startswith("http:"):
  121. link = link[5:]
  122. if link.startswith("//"):
  123. link = link[2:]
  124. return link
  125. def assert_link_prefix(link):
  126. if link.lower().startswith("https:"):
  127. return link
  128. if link.lower().startswith("http:"):
  129. return link
  130. if link.startswith("//"):
  131. return "http:%s" % link
  132. return "http://%s" % link
  133. def clean_internal_link(link, host):
  134. link = strip_link_protocol(link)
  135. if link.lower().startswith("www."):
  136. link = link[4:]
  137. if host.lower().startswith("www."):
  138. host = host[4:]
  139. if link.lower().startswith(host):
  140. link = link[len(host) :]
  141. return link or "/"
  142. def clean_attachment_link(link, force_shva=False):
  143. try:
  144. resolution = resolve(link)
  145. if not resolution.namespaces:
  146. return link
  147. url_name = ":".join(resolution.namespaces + [resolution.url_name])
  148. except (Http404, ValueError):
  149. return link
  150. if url_name in MISAGO_ATTACHMENT_VIEWS:
  151. if force_shva:
  152. link = "%s?shva=1" % link
  153. elif link.endswith("?shva=1"):
  154. link = link[:-7]
  155. return link