Changed URL validation to use urlparse instead of regex for splitting the URL

Enabled test_is_url in Python 3 since it is now passing
This might be one of the last fixes to #1353
Fixes #1598
This commit is contained in:
Leonel Câmara
2017-06-07 04:59:03 +01:00
parent 85ecebc3a4
commit 2a33c0faff
3 changed files with 121 additions and 129 deletions

View File

@@ -24,7 +24,7 @@ from .test_web import *
from .test_sqlhtml import *
from .test_scheduler import *
from .test_cron import *
from .test_is_url import *
if sys.version[:3] == '2.7':
from .test_is_url import *
from .test_old_doctests import *

View File

@@ -586,81 +586,81 @@ class TestUnicode(unittest.TestCase):
# disables prepending the scheme in the return value
def testUnicodeToAsciiUrl(self):
self.assertEquals(unicode_to_ascii_authority(u'www.Alliancefran\xe7aise.nu'), 'www.xn--alliancefranaise-npb.nu')
self.assertEquals(
self.assertEqual(unicode_to_ascii_authority(u'www.Alliancefran\xe7aise.nu'), 'www.xn--alliancefranaise-npb.nu')
self.assertEqual(
unicode_to_ascii_authority(u'www.benn.ca'), 'www.benn.ca')
self.assertRaises(UnicodeError, unicode_to_ascii_authority,
u'\u4e2d' * 1000) # label is too long
def testValidUrls(self):
self.assertEquals(self.x(u'www.Alliancefrancaise.nu'), (
self.assertEqual(self.x(u'www.Alliancefrancaise.nu'), (
'http://www.Alliancefrancaise.nu', None))
self.assertEquals(self.x(u'www.Alliancefran\xe7aise.nu'), (
self.assertEqual(self.x(u'www.Alliancefran\xe7aise.nu'), (
'http://www.xn--alliancefranaise-npb.nu', None))
self.assertEquals(self.x(u'www.Alliancefran\xe7aise.nu:8080'), (
self.assertEqual(self.x(u'www.Alliancefran\xe7aise.nu:8080'), (
'http://www.xn--alliancefranaise-npb.nu:8080', None))
self.assertEquals(self.x(u'http://www.Alliancefran\xe7aise.nu'),
self.assertEqual(self.x(u'http://www.Alliancefran\xe7aise.nu'),
('http://www.xn--alliancefranaise-npb.nu', None))
self.assertEquals(self.x(u'http://www.Alliancefran\xe7aise.nu/parnaise/blue'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue', None))
self.assertEquals(self.x(u'http://www.Alliancefran\xe7aise.nu/parnaise/blue#fragment'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue#fragment', None))
self.assertEquals(self.x(u'http://www.Alliancefran\xe7aise.nu/parnaise/blue?query=value#fragment'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue?query=value#fragment', None))
self.assertEquals(self.x(u'http://www.Alliancefran\xe7aise.nu:8080/parnaise/blue?query=value#fragment'), ('http://www.xn--alliancefranaise-npb.nu:8080/parnaise/blue?query=value#fragment', None))
self.assertEquals(self.x(u'www.Alliancefran\xe7aise.nu/parnaise/blue?query=value#fragment'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue?query=value#fragment', None))
self.assertEquals(self.x(
self.assertEqual(self.x(u'http://www.Alliancefran\xe7aise.nu/parnaise/blue'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue', None))
self.assertEqual(self.x(u'http://www.Alliancefran\xe7aise.nu/parnaise/blue#fragment'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue#fragment', None))
self.assertEqual(self.x(u'http://www.Alliancefran\xe7aise.nu/parnaise/blue?query=value#fragment'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue?query=value#fragment', None))
self.assertEqual(self.x(u'http://www.Alliancefran\xe7aise.nu:8080/parnaise/blue?query=value#fragment'), ('http://www.xn--alliancefranaise-npb.nu:8080/parnaise/blue?query=value#fragment', None))
self.assertEqual(self.x(u'www.Alliancefran\xe7aise.nu/parnaise/blue?query=value#fragment'), ('http://www.xn--alliancefranaise-npb.nu/parnaise/blue?query=value#fragment', None))
self.assertEqual(self.x(
u'http://\u4e2d\u4fd4.com'), ('http://xn--fiq13b.com', None))
self.assertEquals(self.x(u'http://\u4e2d\u4fd4.com/\u4e86'),
self.assertEqual(self.x(u'http://\u4e2d\u4fd4.com/\u4e86'),
('http://xn--fiq13b.com/%4e%86', None))
self.assertEquals(self.x(u'http://\u4e2d\u4fd4.com/\u4e86?query=\u4e86'), ('http://xn--fiq13b.com/%4e%86?query=%4e%86', None))
self.assertEquals(self.x(u'http://\u4e2d\u4fd4.com/\u4e86?query=\u4e86#fragment'), ('http://xn--fiq13b.com/%4e%86?query=%4e%86#fragment', None))
self.assertEquals(self.x(u'http://\u4e2d\u4fd4.com?query=\u4e86#fragment'), ('http://xn--fiq13b.com?query=%4e%86#fragment', None))
self.assertEquals(
self.assertEqual(self.x(u'http://\u4e2d\u4fd4.com/\u4e86?query=\u4e86'), ('http://xn--fiq13b.com/%4e%86?query=%4e%86', None))
self.assertEqual(self.x(u'http://\u4e2d\u4fd4.com/\u4e86?query=\u4e86#fragment'), ('http://xn--fiq13b.com/%4e%86?query=%4e%86#fragment', None))
self.assertEqual(self.x(u'http://\u4e2d\u4fd4.com?query=\u4e86#fragment'), ('http://xn--fiq13b.com?query=%4e%86#fragment', None))
self.assertEqual(
self.x(u'http://B\xfccher.ch'), ('http://xn--bcher-kva.ch', None))
self.assertEquals(self.x(u'http://\xe4\xf6\xfc\xdf.com'), (
self.assertEqual(self.x(u'http://\xe4\xf6\xfc\xdf.com'), (
'http://xn--ss-uia6e4a.com', None))
self.assertEquals(self.x(
self.assertEqual(self.x(
u'http://visegr\xe1d.com'), ('http://xn--visegrd-mwa.com', None))
self.assertEquals(self.x(u'http://h\xe1zipatika.com'), (
self.assertEqual(self.x(u'http://h\xe1zipatika.com'), (
'http://xn--hzipatika-01a.com', None))
self.assertEquals(self.x(u'http://www.\xe7ukurova.com'), (
self.assertEqual(self.x(u'http://www.\xe7ukurova.com'), (
'http://www.xn--ukurova-txa.com', None))
self.assertEquals(self.x(u'http://nixier\xf6hre.nixieclock-tube.com'), ('http://xn--nixierhre-57a.nixieclock-tube.com', None))
self.assertEquals(self.x(u'google.ca.'), ('http://google.ca.', None))
self.assertEqual(self.x(u'http://nixier\xf6hre.nixieclock-tube.com'), ('http://xn--nixierhre-57a.nixieclock-tube.com', None))
self.assertEqual(self.x(u'google.ca.'), ('http://google.ca.', None))
self.assertEquals(
self.assertEqual(
self.y(u'https://google.ca'), ('https://google.ca', None))
self.assertEquals(self.y(
self.assertEqual(self.y(
u'https://\u4e2d\u4fd4.com'), ('https://xn--fiq13b.com', None))
self.assertEquals(self.z(u'google.ca'), ('google.ca', None))
self.assertEqual(self.z(u'google.ca'), ('google.ca', None))
def testInvalidUrls(self):
self.assertEquals(
self.assertEqual(
self.x(u'://ABC.com'), (u'://ABC.com', 'Enter a valid URL'))
self.assertEquals(self.x(u'http://\u4e2d\u4fd4.dne'), (
self.assertEqual(self.x(u'http://\u4e2d\u4fd4.dne'), (
u'http://\u4e2d\u4fd4.dne', 'Enter a valid URL'))
self.assertEquals(self.x(u'https://google.dne'), (
self.assertEqual(self.x(u'https://google.dne'), (
u'https://google.dne', 'Enter a valid URL'))
self.assertEquals(self.x(u'https://google..ca'), (
self.assertEqual(self.x(u'https://google..ca'), (
u'https://google..ca', 'Enter a valid URL'))
self.assertEquals(
self.assertEqual(
self.x(u'google..ca'), (u'google..ca', 'Enter a valid URL'))
self.assertEquals(self.x(u'http://' + u'\u4e2d' * 1000 + u'.com'), (
self.assertEqual(self.x(u'http://' + u'\u4e2d' * 1000 + u'.com'), (
u'http://' + u'\u4e2d' * 1000 + u'.com', 'Enter a valid URL'))
self.assertEquals(self.x(u'http://google.com#fragment_\u4e86'), (
self.assertEqual(self.x(u'http://google.com#fragment_\u4e86'), (
u'http://google.com#fragment_\u4e86', 'Enter a valid URL'))
self.assertEquals(self.x(u'http\u4e86://google.com'), (
self.assertEqual(self.x(u'http\u4e86://google.com'), (
u'http\u4e86://google.com', 'Enter a valid URL'))
self.assertEquals(self.x(u'http\u4e86://google.com#fragment_\u4e86'), (
self.assertEqual(self.x(u'http\u4e86://google.com#fragment_\u4e86'), (
u'http\u4e86://google.com#fragment_\u4e86', 'Enter a valid URL'))
self.assertEquals(self.y(u'http://\u4e2d\u4fd4.com/\u4e86'), (
self.assertEqual(self.y(u'http://\u4e2d\u4fd4.com/\u4e86'), (
u'http://\u4e2d\u4fd4.com/\u4e86', 'Enter a valid URL'))
#self.assertEquals(self.y(u'google.ca'), (u'google.ca', 'Enter a valid URL'))
#self.assertEqual(self.y(u'google.ca'), (u'google.ca', 'Enter a valid URL'))
self.assertEquals(self.z(u'invalid.domain..com'), (
self.assertEqual(self.z(u'invalid.domain..com'), (
u'invalid.domain..com', 'Enter a valid URL'))
self.assertEquals(self.z(u'invalid.\u4e2d\u4fd4.blargg'), (
self.assertEqual(self.z(u'invalid.\u4e2d\u4fd4.blargg'), (
u'invalid.\u4e2d\u4fd4.blargg', 'Enter a valid URL'))
# ##############################################################################

View File

@@ -10,7 +10,6 @@
Validators
-----------
"""
import os
import re
import datetime
@@ -21,7 +20,7 @@ import urllib
import struct
import decimal
import unicodedata
from gluon._compat import StringIO, long, basestring, unicodeT, to_unicode, urllib_unquote, unichr, to_bytes, PY2, to_unicode, to_native
from gluon._compat import StringIO, long, basestring, unicodeT, to_unicode, urllib_unquote, unichr, to_bytes, PY2, to_unicode, to_native, urlparse
from gluon.utils import simple_hash, web2py_uuid, DIGEST_ALG_BY_SIZE
from pydal.objects import Field, FieldVirtual, FieldMethod
from functools import reduce
@@ -195,7 +194,7 @@ class IS_MATCH(Validator):
self.is_unicode = is_unicode or (not(PY2))
def __call__(self, value):
if not(PY2): # PY3 convert bytes to unicode
if not(PY2): # PY3 convert bytes to unicode
value = to_unicode(value)
if self.is_unicode or not(PY2):
@@ -270,7 +269,7 @@ class IS_EXPR(Validator):
return (value, self.expression(value))
# for backward compatibility
self.environment.update(value=value)
exec ('__ret__=' + self.expression, self.environment)
exec('__ret__=' + self.expression, self.environment)
if self.environment['__ret__']:
return (value, None)
return (value, translate(self.error_message))
@@ -1185,7 +1184,6 @@ class IS_EMAIL(Validator):
regex_proposed_but_failed = re.compile('^([\w\!\#$\%\&\'\*\+\-\/\=\?\^\`{\|\}\~]+\.)*[\w\!\#$\%\&\'\*\+\-\/\=\?\^\`{\|\}\~]+@((((([a-z0-9]{1}[a-z0-9\-]{0,62}[a-z0-9]{1})|[a-z])\.)+[a-z]{2,6})|(\d{1,3}\.){3}\d{1,3}(\:\d{1,5})?)$', re.VERBOSE | re.IGNORECASE)
def __init__(self,
banned=None,
forced=None,
@@ -1200,7 +1198,7 @@ class IS_EMAIL(Validator):
def __call__(self, value):
if not(isinstance(value, (basestring, unicodeT))) or not value or '@' not in value:
return (value, translate(self.error_message))
return (value, translate(self.error_message))
body, domain = value.rsplit('@', 1)
@@ -1388,19 +1386,6 @@ unofficial_url_schemes = [
all_url_schemes = [None] + official_url_schemes + unofficial_url_schemes
http_schemes = [None, 'http', 'https']
# This regex comes from RFC 2396, Appendix B. It's used to split a URL into
# its component parts
# Here are the regex groups that it extracts:
# scheme = group(2)
# authority = group(4)
# path = group(5)
# query = group(7)
# fragment = group(9)
url_split_regex = \
re.compile('^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?')
# Defined in RFC 3490, Section 3.1, Requirement #1
# Use this regex to split the authority component of a unicode URL into
# its component labels
@@ -1470,18 +1455,15 @@ def unicode_to_ascii_authority(authority):
# We use the ToASCII operation because we are about to put the authority
# into an IDN-unaware slot
asciiLabels = []
try:
import encodings.idna
for label in labels:
if label:
asciiLabels.append(to_native(encodings.idna.ToASCII(label)))
else:
# encodings.idna.ToASCII does not accept an empty string, but
# it is necessary for us to allow for empty labels so that we
# don't modify the URL
asciiLabels.append('')
except:
asciiLabels = [str(label) for label in labels]
import encodings.idna
for label in labels:
if label:
asciiLabels.append(to_native(encodings.idna.ToASCII(label)))
else:
# encodings.idna.ToASCII does not accept an empty string, but
# it is necessary for us to allow for empty labels so that we
# don't modify the URL
asciiLabels.append('')
# RFC 3490, Section 4, Step 5
return str(reduce(lambda x, y: x + unichr(0x002E) + y, asciiLabels))
@@ -1520,33 +1502,35 @@ def unicode_to_ascii_url(url, prepend_scheme):
"""
# convert the authority component of the URL into an ASCII punycode string,
# but encode the rest using the regular URI character encoding
groups = url_split_regex.match(url).groups()
components = urlparse.urlparse(url)
prepended = False
# If no authority was found
if not groups[3]:
if not components.netloc:
# Try appending a scheme to see if that fixes the problem
scheme_to_prepend = prepend_scheme or 'http'
groups = url_split_regex.match(
to_unicode(scheme_to_prepend) + u'://' + url).groups()
components = urlparse.urlparse(to_unicode(scheme_to_prepend) + u'://' + url)
prepended = True
# if we still can't find the authority
if not groups[3]:
if not components.netloc:
raise Exception('No authority component found, ' +
'could not decode unicode to US-ASCII')
# We're here if we found an authority, let's rebuild the URL
scheme = groups[1]
authority = groups[3]
path = groups[4] or ''
query = groups[5] or ''
fragment = groups[7] or ''
scheme = components.scheme
authority = components.netloc
path = components.path
query = components.query
fragment = components.fragment
if prepend_scheme:
scheme = str(scheme) + '://'
else:
if prepended:
scheme = ''
return scheme + unicode_to_ascii_authority(authority) +\
escape_unicode(path) + escape_unicode(query) + str(fragment)
unparsed = urlparse.urlunparse((scheme, unicode_to_ascii_authority(authority), escape_unicode(path), '', escape_unicode(query), str(fragment)))
if unparsed.startswith('//'):
unparsed = unparsed[2:] # Remove the // urlunparse puts in the beginning
return unparsed
class IS_GENERIC_URL(Validator):
@@ -1607,7 +1591,8 @@ class IS_GENERIC_URL(Validator):
% (self.prepend_scheme, self.allowed_schemes))
GENERIC_URL = re.compile(r"%[^0-9A-Fa-f]{2}|%[^0-9A-Fa-f][0-9A-Fa-f]|%[0-9A-Fa-f][^0-9A-Fa-f]|%$|%[0-9A-Fa-f]$|%[^0-9A-Fa-f]$")
GENERIC_URL_VALID = re.compile(r"[A-Za-z0-9;/?:@&=+$,\-_\.!~*'\(\)%#]+$")
GENERIC_URL_VALID = re.compile(r"[A-Za-z0-9;/?:@&=+$,\-_\.!~*'\(\)%]+$")
URL_FRAGMENT_VALID = re.compile(r"[|A-Za-z0-9;/?:@&=+$,\-_\.!~*'\(\)%]+$")
def __call__(self, value):
"""
@@ -1619,41 +1604,49 @@ class IS_GENERIC_URL(Validator):
prepended with prepend_scheme), and tuple[1] is either
None (success!) or the string error_message
"""
try:
# if the URL does not misuse the '%' character
if not self.GENERIC_URL.search(value):
# if the URL is only composed of valid characters
if self.GENERIC_URL_VALID.match(value):
# Then split up the URL into its components and check on
# the scheme
scheme = url_split_regex.match(value).group(2)
# Clean up the scheme before we check it
if not scheme is None:
scheme = urllib_unquote(scheme).lower()
# If the scheme really exists
if scheme in self.allowed_schemes:
# Then the URL is valid
return (value, None)
else:
# else, for the possible case of abbreviated URLs with
# ports, check to see if adding a valid scheme fixes
# the problem (but only do this if it doesn't have
# one already!)
if value.find('://') < 0 and None in self.allowed_schemes:
schemeToUse = self.prepend_scheme or 'http'
prependTest = self.__call__(
schemeToUse + '://' + value)
# if the prepend test succeeded
if prependTest[1] is None:
# if prepending in the output is enabled
if self.prepend_scheme:
return prependTest
else:
# else return the original,
# non-prepended value
return (value, None)
except:
pass
# if we dont have anything or the URL misuses the '%' character
if not value or self.GENERIC_URL.search(value):
return (value, translate(self.error_message))
if '#' in value:
url, fragment_part = value.split('#')
else:
url, fragment_part = value, ''
# if the URL is only composed of valid characters
if self.GENERIC_URL_VALID.match(url) and (not fragment_part or self.URL_FRAGMENT_VALID.match(fragment_part)):
# Then parse the URL into its components and check on
try:
components = urlparse.urlparse(urllib_unquote(value))._asdict()
except ValueError:
return (value, translate(self.error_message))
# Clean up the scheme before we check it
scheme = components['scheme']
if len(scheme) == 0:
scheme = None
else:
scheme = components['scheme'].lower()
# If the scheme doesn't really exists
if scheme not in self.allowed_schemes or not scheme and ':' in components['path']:
# for the possible case of abbreviated URLs with
# ports, check to see if adding a valid scheme fixes
# the problem (but only do this if it doesn't have
# one already!)
if '://' not in value and None in self.allowed_schemes:
schemeToUse = self.prepend_scheme or 'http'
prependTest = self.__call__(
schemeToUse + '://' + value)
# if the prepend test succeeded
if prependTest[1] is None:
# if prepending in the output is enabled
if self.prepend_scheme:
return prependTest
else:
return (value, None)
else:
return (value, None)
# else the URL is not valid
return (value, translate(self.error_message))
@@ -1920,15 +1913,14 @@ class IS_HTTP_URL(Validator):
(possible prepended with prepend_scheme), and tuple[1] is either
None (success!) or the string error_message
"""
try:
# if the URL passes generic validation
x = IS_GENERIC_URL(error_message=self.error_message,
allowed_schemes=self.allowed_schemes,
prepend_scheme=self.prepend_scheme)
if x(value)[1] is None:
componentsMatch = url_split_regex.match(value)
authority = componentsMatch.group(4)
components = urlparse.urlparse(value)
authority = components.netloc
# if there is an authority component
if authority:
# if authority is a valid IP address
@@ -1948,7 +1940,7 @@ class IS_HTTP_URL(Validator):
else:
# else this is a relative/abbreviated URL, which will parse
# into the URL's path component
path = componentsMatch.group(5)
path = components.path
# relative case: if this is a valid path (if it starts with
# a slash)
if path.startswith('/'):
@@ -1957,7 +1949,7 @@ class IS_HTTP_URL(Validator):
else:
# abbreviated case: if we haven't already, prepend a
# scheme and see if it fixes the problem
if value.find('://') < 0:
if '://' not in value and None in self.allowed_schemes:
schemeToUse = self.prepend_scheme or 'http'
prependTest = self.__call__(schemeToUse
+ '://' + value)
@@ -2124,7 +2116,6 @@ class IS_URL(Validator):
# If we are not able to convert the unicode url into a
# US-ASCII URL, then the URL is not valid
return (value, translate(self.error_message))
methodResult = subMethod(asciiValue)
# if the validation of the US-ASCII version of the value failed
if not methodResult[1] is None:
@@ -2494,6 +2485,7 @@ class IS_LOWER(Validator):
('\\xc3\\xb1', None)
"""
def __call__(self, value):
cast_back = lambda x: x
if isinstance(value, str):