Rafał Pitoń 8 лет назад
Родитель
Сommit
222cc2da11
2 изменённых файла: 72 добавлено и 22 удалено
  1. 45 21
      misago/markup/parser.py
  2. 27 1
      misago/markup/tests/test_parser.py

+ 45 - 21
misago/markup/parser.py

@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import markdown
 
 import bleach
@@ -119,41 +121,63 @@ def linkify_paragraphs(result):
 
 
 def clean_links(request, result):
-    site_address = '%s://%s' % (request.scheme, request.get_host())
+    """Normalise links and images found in ``result['parsed_text']``.
+
+    Hrefs and image sources recognised as pointing at the requesting host
+    are rewritten to host-relative paths. Every other href is appended to
+    ``result['outgoing_links']`` and every image source (after rewriting)
+    to ``result['images']``. Scheme prefixes are removed from link text and
+    from image alt text. ``result`` is modified in place.
+    """
+    host = request.get_host()
 
     soup = BeautifulSoup(result['parsed_text'], 'html5lib')
     for link in soup.find_all('a'):
-        if link['href'].lower().startswith(site_address):
-            result['inside_links'].append(link['href'])
-            if link['href'].lower() == site_address:
-                link['href'] = '/'
-            else:
-                link['href'] = link['href'][len(site_address):]
+        # NOTE(review): the previous implementation also appended internal
+        # hrefs to result['inside_links']; that is no longer done here.
+        # Confirm nothing downstream still relies on that list.
+        if is_internal_link(link['href'], host):
+            link['href'] = clean_internal_link(link['href'], host)
         else:
             result['outgoing_links'].append(link['href'])
 
-        if link.string.startswith('http://'):
-            link.string.replace_with(link.string[7:].strip())
-        if link.string.startswith('https://'):
-            link.string.replace_with(link.string[8:].strip())
+        # link.string is falsy when the anchor has no single text child
+        # (for example a link that only wraps an <img>), so skip those.
+        if link.string:
+            link.string = clean_link_prefix(link.string)
 
     for img in soup.find_all('img'):
-        result['images'].append(img['src'])
-        if img['src'].lower().startswith(site_address):
-            if img['src'].lower() == site_address:
-                img['src'] = '/'
-            else:
-                img['src'] = img['src'][len(site_address):]
-
-        if img['alt'].startswith('http://'):
-            img['alt'] = img['alt'][7:].strip()
-        if img['alt'].startswith('https://'):
-            img['alt'] = img['alt'][8:].strip()
+        # An image without an alt attribute must not raise KeyError.
+        if img.get('alt'):
+            img['alt'] = clean_link_prefix(img['alt'])
+        if is_internal_link(img['src'], host):
+            img['src'] = clean_internal_link(img['src'], host)
+        # Recorded after the rewrite, so internal images are stored as paths.
+        result['images'].append(img['src'])
 
     # [6:-7] trims <body></body> wrap
     result['parsed_text'] = six.text_type(soup.body)[6:-7]
 
 
+def is_internal_link(link, host):
+    """Return True when ``link`` points at the site served from ``host``.
+
+    A link is internal when it is a host-relative path (one leading slash),
+    or when, after dropping the scheme and an optional ``www.`` prefix, it
+    starts with ``host`` and the host is followed by the end of the string
+    or by ``/``, ``?`` or ``#``. The comparison is case-insensitive.
+    """
+    if link.startswith('/') and not link.startswith('//'):
+        return True
+
+    link = clean_link_prefix(link).lower()
+    host = host.lower()
+
+    # str.lstrip('www.') strips any leading run of 'w' and '.' characters
+    # rather than the literal prefix, so "wtest.com" used to match the host
+    # "test.com". Remove the exact prefix instead.
+    if link.startswith('www.'):
+        link = link[4:]
+    if host.startswith('www.'):
+        host = host[4:]
+
+    if not link.startswith(host):
+        return False
+
+    # The host must end at a URL boundary; otherwise "test.com.evil.com"
+    # or "test.com:81" would be treated as the local site.
+    return link[len(host):len(host) + 1] in ('', '/', '?', '#')
+
+
+def clean_link_prefix(link):
+    """Return ``link`` without a leading ``https:``/``http:`` and ``//``.
+
+    Scheme matching is case-insensitive. Each check runs on the result of
+    the previous one, in this order: ``https:``, ``http:``, then ``//``.
+    """
+    for scheme in ('https:', 'http:'):
+        if link.lower().startswith(scheme):
+            link = link[len(scheme):]
+
+    return link[2:] if link.startswith('//') else link
+
+
+def clean_internal_link(link, host):
+    """Reduce an internal ``link`` to the part that follows ``host``.
+
+    The scheme, a leading ``//`` and an optional ``www.`` prefix are dropped
+    from the link, and ``www.`` is dropped from the host, before a leading
+    ``host`` is cut off the link. Returns ``'/'`` when nothing is left.
+    """
+    link = clean_link_prefix(link)
+
+    if link.lower().startswith('www.'):
+        link = link[4:]
+
+    # The link is compared in lowercase below, so the host has to be
+    # lowercased too or a mixed-case Host value would never match.
+    host = host.lower()
+    if host.startswith('www.'):
+        host = host[4:]
+
+    if link.lower().startswith(host):
+        link = link[len(host):]
+
+    return link or '/'
+
+
 def minify_result(result):
     # [25:-14] trims <html><head></head><body> and </body></html>
     result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))

+ 27 - 1
misago/markup/tests/test_parser.py

@@ -124,7 +124,7 @@ Hey there @{}, how's going?
 
 class CleanLinksTests(TestCase):
     def test_clean_current_link(self):
-        """clean_links step leaves http://test.com alone"""
+        """clean_links step cleans http://test.com"""
         test_text = """
 Lorem ipsum: http://test.com
 """.strip()
@@ -136,6 +136,19 @@ Lorem ipsum: http://test.com
         result = parse(test_text, MockRequest(), MockPoster(), minify=True)
         self.assertEqual(expected_result, result['parsed_text'])
 
+    def test_clean_schemaless_link(self):
+        """clean_links step rewrites a scheme-less test.com link to /"""
+        expected_html = """
+<p>Lorem ipsum: <a href="/" rel="nofollow">test.com</a></p>
+""".strip()
+
+        markup = """
+Lorem ipsum: test.com
+""".strip()
+
+        parsed = parse(markup, MockRequest(), MockPoster(), minify=True)
+        self.assertEqual(expected_html, parsed['parsed_text'])
+
     def test_trim_current_path(self):
         """clean_links step leaves http://test.com path"""
         test_text = """
@@ -200,3 +213,16 @@ Lorem ipsum: http://somewhere.com/somewhere-something/
 
         result = parse(test_text, MockRequest(), MockPoster(), minify=True)
         self.assertEqual(expected_result, result['parsed_text'])
+
+    def test_clean_linked_image(self):
+        """clean_links step cleans both the href and the nested image src"""
+        # Stripped like the sibling tests so no stray newline or indentation
+        # is fed to the parser.
+        test_text = """
+[![3.png](http://test.com/attachment/thumb/test-43/)](http://test.com/attachment/test-43/)
+""".strip()
+
+        expected_result = """
+<p><a href="/attachment/test-43/" rel="nofollow"><img alt="3.png" src="/attachment/thumb/test-43/"/></a></p>
+""".strip()
+
+        result = parse(test_text, MockRequest(), MockPoster(), minify=True)
+        self.assertEqual(expected_result, result['parsed_text'])