Browse Source

Add rel="nofollow" to outgoing links and cut domain off inner links #67

Ralfp 12 years ago
parent
commit
d6a29829e0

+ 37 - 0
misago/markdown/extensions/cleanlinks.py

@@ -0,0 +1,37 @@
+import markdown
+from markdown.util import etree
+from misago.utils.urls import is_url, is_inner, clean_inner
+
+class CleanLinksExtension(markdown.Extension):
+    """Markdown extension that installs ``CleanLinksTreeprocessor``."""
+
+    def extendMarkdown(self, md):
+        """Register this extension on ``md`` and append the link-cleaning
+        tree processor under the ``mi_cleanlinks`` key at the ``_end``
+        position of ``md.treeprocessors``."""
+        md.registerExtension(self)
+        processor = CleanLinksTreeprocessor(md)
+        md.treeprocessors.add('mi_cleanlinks', processor, '_end')
+
+
+class CleanLinksTreeprocessor(markdown.treeprocessors.Treeprocessor):
+    """Tree processor that rewrites links and images in the element tree.
+
+    * ``<a>`` elements whose ``href`` is an inner address (``is_inner``) get
+      the ``href`` replaced by ``clean_inner(href)``; any other ``<a>`` that
+      has an ``href`` gets ``rel="nofollow"``.
+    * ``<img>`` elements with an inner ``src`` get the ``src`` replaced by
+      ``clean_inner(src)``.
+    * Text found inside a link that is itself an inner URL is replaced by
+      the cleaned address without its first character.
+
+    NOTE(review): ``clean_inner`` returns an HTML-escaped string; if the
+    serializer escapes attribute values and text again, ``&`` would end up
+    double-escaped - confirm against the Markdown version in use.
+    """
+
+    def run(self, root):
+        """Process the tree rooted at ``root`` in place.
+
+        Returns whatever ``walk_tree`` returns (``None``).
+        """
+        # Retained so code that inspects this attribute keeps working; the
+        # "inside a link" state is now passed down the recursion explicitly.
+        self.inurl = False
+        return self.walk_tree(root)
+
+    def walk_tree(self, node, in_link=False):
+        """Clean ``node`` and then recurse into its children.
+
+        ``in_link`` is True when ``node`` is, or is nested inside, an ``<a>``
+        element. It is passed as an argument rather than kept on ``self`` so
+        that finishing one child does not switch the flag off for its
+        siblings.
+        """
+        if node.tag == 'a':
+            in_link = True
+            href = node.get('href')
+            # Anchors without an href have nothing to clean or to mark.
+            if href:
+                if is_inner(href):
+                    node.set('href', clean_inner(href))
+                else:
+                    node.set('rel', 'nofollow')
+        elif node.tag == 'img':
+            src = node.get('src')
+            if src and is_inner(src):
+                node.set('src', clean_inner(src))
+
+        text = node.text
+        if in_link and text:
+            try:
+                # Only shorten text that really is an inner URL; external or
+                # scheme-less addresses must be displayed unchanged.
+                if is_url(text) and is_inner(text):
+                    cleaned = clean_inner(text)[1:]
+                    # Never replace the link text with an empty string.
+                    if cleaned:
+                        node.text = cleaned
+            except TypeError:
+                # Best effort: leave text of an unexpected type untouched.
+                pass
+
+        for child in node:
+            self.walk_tree(child, in_link)

+ 8 - 9
misago/markdown/extensions/magiclinks.py

@@ -4,6 +4,8 @@ import markdown
 from markdown.inlinepatterns import LinkPattern
 from markdown.inlinepatterns import LinkPattern
 from markdown.postprocessors import RawHtmlPostprocessor
 from markdown.postprocessors import RawHtmlPostprocessor
 from markdown.util import etree
 from markdown.util import etree
+from misago.utils.strings import html_escape
+from misago.utils.urls import is_inner, clean_inner
 
 
 # Global vars
 # Global vars
 MAGICLINKS_RE = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.UNICODE)
 MAGICLINKS_RE = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.UNICODE)
@@ -25,8 +27,11 @@ class MagicLinksTreeprocessor(markdown.treeprocessors.Treeprocessor):
             link = LinkPattern(MAGICLINKS_RE, self.markdown)
             link = LinkPattern(MAGICLINKS_RE, self.markdown)
             href = link.sanitize_url(link.unescape(matchobj.group(0).strip()))
             href = link.sanitize_url(link.unescape(matchobj.group(0).strip()))
             if href:
             if href:
-                href = self.escape(href)
-                return self.markdown.htmlStash.store('<a href="%(href)s">%(href)s</a>' % {'href': href}, safe=True)
+                if is_inner(href):
+                    clean = clean_inner(href)
+                    return self.markdown.htmlStash.store('<a href="%s">%s</a>' % (clean, clean[1:]), safe=True)
+                else:
+                    return self.markdown.htmlStash.store('<a href="%(href)s" rel="nofollow">%(href)s</a>' % {'href': href}, safe=True)
             else:
             else:
                 return matchobj.group(0)
                 return matchobj.group(0)
 
 
@@ -36,10 +41,4 @@ class MagicLinksTreeprocessor(markdown.treeprocessors.Treeprocessor):
             if node.tail and unicode(node.tail).strip():
             if node.tail and unicode(node.tail).strip():
                 node.tail = MAGICLINKS_RE.sub(parse_link, unicode(node.tail))
                 node.tail = MAGICLINKS_RE.sub(parse_link, unicode(node.tail))
             for i in node:
             for i in node:
-                self.walk_tree(i)
-
-    def escape(self, html):
-        html = html.replace('&', '&amp;')
-        html = html.replace('<', '&lt;')
-        html = html.replace('>', '&gt;')
-        return html.replace('"', '&quot;')
+                self.walk_tree(i)

+ 7 - 38
misago/markdown/factory.py

@@ -1,48 +1,14 @@
 import re
 import re
 import markdown
 import markdown
-from HTMLParser import HTMLParser
 from django.conf import settings
 from django.conf import settings
 from django.utils.importlib import import_module
 from django.utils.importlib import import_module
 from django.utils.translation import ugettext_lazy as _
 from django.utils.translation import ugettext_lazy as _
 from misago.utils.strings import random_string
 from misago.utils.strings import random_string
-
-class ClearHTMLParser(HTMLParser):
-    def __init__(self):
-        HTMLParser.__init__(self)
-        self.clean_text = ''
-        self.lookback = []
-        
-    def handle_entityref(self, name):
-        if name == 'gt':
-            self.clean_text += '>'
-        if name == 'lt':
-            self.clean_text += '<'
-
-    def handle_starttag(self, tag, attrs):
-        self.lookback.append(tag)
-
-    def handle_endtag(self, tag):
-        try:
-            if self.lookback[-1] == tag:
-                self.lookback.pop()
-        except IndexError:
-            pass
-        
-    def handle_data(self, data):
-        # String does not repeat itself
-        if self.clean_text[-len(data):] != data:
-            # String is not "QUOTE"
-            try:
-                if self.lookback[-1] in ('strong', 'em'):
-                    self.clean_text += data
-                elif not (data == 'Quote' and self.lookback[-1] == 'h3' and self.lookback[-2] == 'blockquote'):
-                    self.clean_text += data
-            except IndexError:
-                self.clean_text += data
-
+from misago.markdown.extensions.cleanlinks import CleanLinksExtension
+from misago.markdown.parsers import RemoveHTMLParser
 
 
 def clear_markdown(text):
 def clear_markdown(text):
-    parser = ClearHTMLParser()
+    parser = RemoveHTMLParser()
     parser.feed(text)
     parser.feed(text)
     return parser.clean_text
     return parser.clean_text
 
 
@@ -62,6 +28,8 @@ def signature_markdown(acl, text):
                            extensions=['nl2br'])
                            extensions=['nl2br'])
 
 
     remove_unsupported(md)
     remove_unsupported(md)
+    cleanlinks = CleanLinksExtension()
+    cleanlinks.extendMarkdown(md)
 
 
     if not acl.usercp.allow_signature_links():
     if not acl.usercp.allow_signature_links():
         del md.inlinePatterns['link']
         del md.inlinePatterns['link']
@@ -96,7 +64,8 @@ def post_markdown(request, text):
         ext = attr()
         ext = attr()
         ext.extendMarkdown(md)
         ext.extendMarkdown(md)
     text = md.convert(text)
     text = md.convert(text)
-    return tidy_markdown(md, text)
+    md, text = tidy_markdown(md, text)
+    return md, text
 
 
 
 
 def tidy_markdown(md, text):
 def tidy_markdown(md, text):

+ 38 - 0
misago/markdown/parsers.py

@@ -0,0 +1,38 @@
+from HTMLParser import HTMLParser
+from urlparse import urlparse
+from django.conf import settings
+from misago.utils.strings import random_string
+
+class RemoveHTMLParser(HTMLParser):
+    """HTML parser that accumulates the text content of the fed markup.
+
+    The collected text is available as ``clean_text``; ``lookback`` is the
+    stack of start tags seen so far that have not been closed.
+    """
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.clean_text = ''
+        self.lookback = []
+
+    def handle_entityref(self, name):
+        """Turn the ``gt`` and ``lt`` entities back into ``>`` and ``<``;
+        every other named entity is dropped."""
+        char = {'gt': '>', 'lt': '<'}.get(name)
+        if char is not None:
+            self.clean_text += char
+
+    def handle_starttag(self, tag, attrs):
+        """Push the opened tag on the ``lookback`` stack."""
+        self.lookback.append(tag)
+
+    def handle_endtag(self, tag):
+        """Pop the ``lookback`` stack when ``tag`` closes its top entry."""
+        if self.lookback and self.lookback[-1] == tag:
+            self.lookback.pop()
+
+    def handle_data(self, data):
+        """Append ``data`` to ``clean_text`` unless it repeats the current
+        tail of the text or is a quote title."""
+        # String repeats itself: nothing to add
+        if self.clean_text[-len(data):] == data:
+            return
+        stack = self.lookback
+        # "Quote" directly inside <blockquote><h3> is a quote title
+        is_quote_title = (data == 'Quote'
+                          and len(stack) >= 2
+                          and stack[-1] == 'h3'
+                          and stack[-2] == 'blockquote')
+        if not is_quote_title:
+            self.clean_text += data

+ 1 - 0
misago/settings_base.py

@@ -144,6 +144,7 @@ MARKDOWN_EXTENSIONS = (
     'misago.markdown.extensions.quotes.QuoteTitlesExtension',
     'misago.markdown.extensions.quotes.QuoteTitlesExtension',
     'misago.markdown.extensions.mentions.MentionsExtension',
     'misago.markdown.extensions.mentions.MentionsExtension',
     'misago.markdown.extensions.magiclinks.MagicLinksExtension',
     'misago.markdown.extensions.magiclinks.MagicLinksExtension',
+    'misago.markdown.extensions.cleanlinks.CleanLinksExtension',
 )
 )
 
 
 # Name of root urls configuration
 # Name of root urls configuration

+ 6 - 0
misago/utils/strings.py

@@ -27,3 +27,9 @@ def short_string(string, length=16):
     if len(bits[-1]) < 3:
     if len(bits[-1]) < 3:
         bits.pop()
         bits.pop()
     return '%s...' % (' '.join(bits))
     return '%s...' % (' '.join(bits))
+
+def html_escape(html):
+    """Return ``html`` with ``&``, ``<``, ``>`` and ``"`` replaced by their
+    HTML entities."""
+    # The ampersand goes first so the ampersands introduced by the later
+    # replacements are not escaped a second time.
+    replacements = (
+        ('&', '&amp;'),
+        ('<', '&lt;'),
+        ('>', '&gt;'),
+        ('"', '&quot;'),
+    )
+    for char, entity in replacements:
+        html = html.replace(char, entity)
+    return html

+ 24 - 0
misago/utils/urls.py

@@ -0,0 +1,24 @@
+#-*- coding: utf-8 -*-
+import re
+from urlparse import urlparse
+from django.conf import settings
+from misago.utils.strings import html_escape
+
+# Matches a string that consists of a single URL and nothing else.
+# The inline (?i) flag has to be the very first thing in the pattern: newer
+# Python versions reject a global flag placed after other pattern elements.
+URL_RE = re.compile(r'(?i)^\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))$', re.UNICODE)
+
+def is_url(string):
+    """Return True if ``string``, with surrounding whitespace stripped,
+    matches ``URL_RE``.
+
+    ``None`` and the empty string are reported as "not a URL" instead of
+    raising, so callers can pass element text that may be missing.
+    """
+    if not string:
+        return False
+    return URL_RE.search(string.strip()) is not None
+
+
+def is_inner(string):
+    """Return True if the network location of ``string`` equals the network
+    location of ``settings.BOARD_ADDRESS`` (compared case-insensitively).
+
+    ``None`` and the empty string are reported as not inner instead of
+    raising, so callers can pass an attribute value that may be missing.
+    """
+    if not string:
+        return False
+    board_netloc = urlparse(settings.BOARD_ADDRESS.lower()).netloc
+    return urlparse(string.strip()).netloc.lower() == board_netloc
+
+
+def clean_inner(string):
+    """Return the HTML-escaped path of ``string``, followed by its query
+    string and fragment when present; scheme and network location are
+    left out."""
+    parts = urlparse(string.strip())
+    pieces = [parts.path]
+    if parts.query:
+        pieces.append('?%s' % parts.query)
+    if parts.fragment:
+        pieces.append('#%s' % parts.fragment)
+    return html_escape(''.join(pieces))