// Package uri is meant to be an RFC 3986 compliant URI builder and parser.
//
// This is based on the work from ttacon/uri (credits: Trey Tacon).
//
// This fork concentrates on RFC 3986 strictness for URI parsing and validation.
//
// Reference: https://tools.ietf.org/html/rfc3986
//
// Tests have been augmented with test suites of URI validators in other languages:
// perl, python, scala, .Net.
//
// Extra features like MySQL URIs present in the original repo have been removed.
package uri

import (
	"errors"
	"fmt"
	"io"
	"net/netip"
	"net/url"
	"strings"
	"unicode"
)

// URI represents a general RFC3986 URI.
type URI interface {
	// Scheme the URI conforms to.
	Scheme() string

	// Authority information for the URI, including the "//" prefix.
	Authority() Authority

	// Query returns a map of key/value pairs of all parameters
	// in the query string of the URI.
	Query() url.Values

	// Fragment returns the fragment (component preceded by '#') in the
	// URI if there is one.
	Fragment() string

	// Builder returns a Builder that can be used to modify the URI.
	Builder() Builder

	// String representation of the URI.
	String() string

	// Validate the different components of the URI.
	Validate() error
}

// Authority information that a URI contains
// as specified by RFC3986.
//
// Username and password are given by UserInfo().
type Authority interface {
	// UserInfo returns the userinfo component (the part preceding '@').
	UserInfo() string
	// Host returns the host component.
	Host() string
	// Port returns the port component.
	Port() string
	// Path returns the path component.
	Path() string
	// String representation of the authority.
	String() string
	// Validate the components of the authority. The variadic strings are
	// scheme names used to select host validation rules.
	Validate(...string) error
}

const (
	// char and string literals.
	colonMark          = ':'
	questionMark       = '?'
	fragmentMark       = '#'
	percentMark        = '%'
	atHost             = '@'
	slashMark          = '/'
	openingBracketMark = '['
	closingBracketMark = ']'
	authorityPrefix    = "//"
)

var (
	// predefined sets of accepted runes beyond the "unreserved" character set.
	//
	// The derived sets are built with append() on pcharExtraRunes: since that
	// slice is a 2-element literal (len == cap), each append allocates a new
	// backing array and the three sets do not alias each other.
	pcharExtraRunes           = []rune{':', '@'} // pchar = unreserved | ':' | '@'
	queryOrFragmentExtraRunes = append(pcharExtraRunes, '/', '?')
	// NOTE(review): this yields {':', '@', ':'}, i.e. ':' appears twice and '@'
	// is accepted, whereas RFC 3986 defines
	// userinfo = *( unreserved / pct-encoded / sub-delims / ":" ).
	// Confirm whether accepting '@' here is intended.
	userInfoExtraRunes = append(pcharExtraRunes, ':')
)

// IsURI tells if a URI is valid according to RFC3986/RFC397.
func IsURI(raw string) bool { _, err := Parse(raw) return err == nil } // IsURIReference tells if a URI reference is valid according to RFC3986/RFC397 // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-4.1 and // https://www.rfc-editor.org/rfc/rfc3986#section-4.2 func IsURIReference(raw string) bool { _, err := ParseReference(raw) return err == nil } // Parse attempts to parse a URI. // It returns an error if the URI is not RFC3986-compliant. func Parse(raw string) (URI, error) { return parse(raw, false) } // ParseReference attempts to parse a URI relative reference. // // It returns an error if the URI is not RFC3986-compliant. func ParseReference(raw string) (URI, error) { return parse(raw, true) } func parse(raw string, withURIReference bool) (URI, error) { var ( scheme string curr int ) schemeEnd := strings.IndexByte(raw, colonMark) // position of a ":" hierPartEnd := strings.IndexByte(raw, questionMark) // position of a "?" queryEnd := strings.IndexByte(raw, fragmentMark) // position of a "#" // exclude pathological input if schemeEnd == 0 || hierPartEnd == 0 || queryEnd == 0 { // ":", "?", "#" return nil, ErrInvalidURI } if schemeEnd == 1 { return nil, errorsJoin( ErrInvalidScheme, fmt.Errorf("scheme has a minimum length of 2 characters"), ) } if hierPartEnd == 1 || queryEnd == 1 { // ".:", ".?", ".#" return nil, ErrInvalidURI } if hierPartEnd > 0 && hierPartEnd < schemeEnd || queryEnd > 0 && queryEnd < schemeEnd { // e.g. htt?p: ; h#ttp: .. return nil, ErrInvalidURI } if queryEnd > 0 && queryEnd < hierPartEnd { // e.g. https://abc#a?b hierPartEnd = queryEnd } isRelative := strings.HasPrefix(raw, authorityPrefix) switch { case schemeEnd > 0 && !isRelative: scheme = raw[curr:schemeEnd] if schemeEnd+1 == len(raw) { // trailing ':' (e.g. 
http:) u := &uri{ scheme: scheme, } return u, u.Validate() } case !withURIReference: // scheme is required for URI return nil, errorsJoin( ErrNoSchemeFound, fmt.Errorf("for URI (not URI reference), the scheme is required"), ) case isRelative: // scheme is optional for URI references. // // start with // and a ':' is following... e.g //example.com:8080/path schemeEnd = -1 } curr = schemeEnd + 1 if hierPartEnd == len(raw)-1 || (hierPartEnd < 0 && queryEnd < 0) { // trailing ? or (no query & no fragment) if hierPartEnd < 0 { hierPartEnd = len(raw) } authorityInfo, err := parseAuthority(raw[curr:hierPartEnd]) if err != nil { return nil, errorsJoin(ErrInvalidURI, err) } u := &uri{ scheme: scheme, hierPart: raw[curr:hierPartEnd], authority: authorityInfo, } return u, u.Validate() } var ( hierPart, query, fragment string authorityInfo authorityInfo err error ) if hierPartEnd > 0 { hierPart = raw[curr:hierPartEnd] authorityInfo, err = parseAuthority(hierPart) if err != nil { return nil, errorsJoin(ErrInvalidURI, err) } if hierPartEnd+1 < len(raw) { if queryEnd < 0 { // query ?, no fragment query = raw[hierPartEnd+1:] } else if hierPartEnd < queryEnd-1 { // query ?, fragment query = raw[hierPartEnd+1 : queryEnd] } } curr = hierPartEnd + 1 } if queryEnd == len(raw)-1 && hierPartEnd < 0 { // trailing #, no query "?" 
hierPart = raw[curr:queryEnd] authorityInfo, err = parseAuthority(hierPart) if err != nil { return nil, errorsJoin(ErrInvalidURI, err) } u := &uri{ scheme: scheme, hierPart: hierPart, authority: authorityInfo, query: query, } return u, u.Validate() } if queryEnd > 0 { // there is a fragment if hierPartEnd < 0 { // no query hierPart = raw[curr:queryEnd] authorityInfo, err = parseAuthority(hierPart) if err != nil { return nil, errorsJoin(ErrInvalidURI, err) } } if queryEnd+1 < len(raw) { fragment = raw[queryEnd+1:] } } u := &uri{ scheme: scheme, hierPart: hierPart, query: query, fragment: fragment, authority: authorityInfo, } return u, u.Validate() } type uri struct { // raw components scheme string hierPart string query string fragment string // parsed components authority authorityInfo } func (u *uri) URI() URI { return u } func (u *uri) Scheme() string { return u.scheme } func (u *uri) Authority() Authority { u.ensureAuthorityExists() return u.authority } // Query returns parsed query parameters like standard lib URL.Query(). func (u *uri) Query() url.Values { v, _ := url.ParseQuery(u.query) return v } func (u *uri) Fragment() string { return u.fragment } func isNumerical(input string) bool { return strings.IndexFunc(input, func(r rune) bool { return r < '0' || r > '9' }, ) == -1 } // Validate checks that all parts of a URI abide by allowed characters. func (u *uri) Validate() error { if u.scheme != "" { if err := u.validateScheme(u.scheme); err != nil { return err } } if u.query != "" { if err := u.validateQuery(u.query); err != nil { return err } } if u.fragment != "" { if err := u.validateFragment(u.fragment); err != nil { return err } } if u.hierPart != "" { return u.Authority().Validate(u.scheme) } // empty hierpart case return nil } // validateScheme verifies the correctness of the scheme part. // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.1 // // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." 
) // // NOTE: scheme is not supposed to contain any percent-encoded sequence. // // TODO(fredbi): verify the IRI RFC to check if unicode is allowed in scheme. func (u *uri) validateScheme(scheme string) error { if len(scheme) < 2 { return ErrInvalidScheme } for i, r := range scheme { if i == 0 { if !unicode.IsLetter(r) { return ErrInvalidScheme } continue } if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '+' && r != '-' && r != '.' { return ErrInvalidScheme } } return nil } // validateQuery validates the query part. // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.4 // // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" // query = *( pchar / "/" / "?" ) func (u *uri) validateQuery(query string) error { if err := validateUnreservedWithExtra(query, queryOrFragmentExtraRunes); err != nil { return errorsJoin(ErrInvalidQuery, err) } return nil } // validateFragment validatesthe fragment part. // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.5 // // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" // // fragment = *( pchar / "/" / "?" ) func (u *uri) validateFragment(fragment string) error { if err := validateUnreservedWithExtra(fragment, queryOrFragmentExtraRunes); err != nil { return errorsJoin(ErrInvalidFragment, err) } return nil } func validateUnreservedWithExtra(s string, acceptedRunes []rune) error { skip := 0 for i, r := range s { if skip > 0 { skip-- continue } // accepts percent-encoded sequences if r == '%' { if i+2 >= len(s) || !isHex(s[i+1]) || !isHex(s[i+2]) { return fmt.Errorf("part %q contains a malformed percent-encoded part near [%s...]", s, s[:i]) } skip = 2 continue } // RFC grammar definitions: // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" // / "*" / "+" / "," / ";" / "=" // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" // unreserved = ALPHA / DIGIT / "-" / "." 
/ "_" / "~" // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" if !unicode.IsLetter(r) && !unicode.IsDigit(r) && // unreserved r != '-' && r != '.' && r != '_' && r != '~' && // sub-delims r != '!' && r != '$' && r != '&' && r != '\'' && r != '(' && r != ')' && r != '*' && r != '+' && r != ',' && r != ';' && r != '=' { runeFound := false for _, acceptedRune := range acceptedRunes { if r == acceptedRune { runeFound = true break } } if !runeFound { return fmt.Errorf("%q contains an invalid character: '%U' (%q)", s, r, r) } } } return nil } func isHex[T byte | rune](c T) bool { switch { case '0' <= c && c <= '9': return true case 'a' <= c && c <= 'f': return true case 'A' <= c && c <= 'F': return true } return false } // validateIPvFuture covers the special provision in the RFC for future IP scheme. // The passed argument removes the heading "v" character. // // Example: http://[v6.fe80::a_en1] // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2 // // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) func validateIPvFuture(address string) error { rr := strings.NewReader(address) var ( foundHexDigits, foundDot bool ) for { r, _, err := rr.ReadRune() if err == io.EOF { break } if r == '.' { foundDot = true break } if !isHex(r) { return errors.New( "invalid IP vFuture format: expect an hexadecimal version tag", ) } foundHexDigits = true } if !foundHexDigits || !foundDot { return errors.New( "invalid IP vFuture format: expect a '.' 
after an hexadecimal version tag", ) } if rr.Len() == 0 { return errors.New("invalid IP vFuture format: expect a non-empty address after the version tag") } offset, _ := rr.Seek(0, io.SeekCurrent) return validateUnreservedWithExtra(address[offset:], userInfoExtraRunes) } type authorityInfo struct { prefix string userinfo string host string port string path string isIPv6 bool } func (a authorityInfo) UserInfo() string { return a.userinfo } func (a authorityInfo) Host() string { return a.host } func (a authorityInfo) Port() string { return a.port } func (a authorityInfo) Path() string { return a.path } func (a authorityInfo) String() string { buf := strings.Builder{} buf.WriteString(a.prefix) buf.WriteString(a.userinfo) if len(a.userinfo) > 0 { buf.WriteByte(atHost) } if a.isIPv6 { buf.WriteString("[" + a.host + "]") } else { buf.WriteString(a.host) } if len(a.port) > 0 { buf.WriteByte(colonMark) } buf.WriteString(a.port) buf.WriteString(a.path) return buf.String() } // Validate the Authority part. // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.2 func (a authorityInfo) Validate(schemes ...string) error { if a.path != "" { if err := a.validatePath(a.path); err != nil { return err } } if a.host != "" { if err := a.validateHost(a.host, a.isIPv6, schemes...); err != nil { return err } } if a.port != "" { if err := a.validatePort(a.port, a.host); err != nil { return err } } if a.userinfo != "" { if err := a.validateUserInfo(a.userinfo); err != nil { return err } } return nil } // validatePath validates the path part. 
// // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.3 func (a authorityInfo) validatePath(path string) error { if a.host == "" && a.port == "" && len(path) >= 2 && path[0] == '/' && path[1] == '/' { return errorsJoin( ErrInvalidPath, fmt.Errorf( `if a URI does not contain an authority component, then the path cannot begin with two slash characters ("//"): %q`, a.path, )) } // NOTE: this loop used to be neatly written with strings.Split(). // However, analysis showed that constantly allocating the returned slice // was a significant burden on the gc (at least when compared to the workload // the rest of this module generates). var previousPos int for pos, char := range path { if char != '/' { continue } if pos > previousPos { if err := validateUnreservedWithExtra(path[previousPos:pos], pcharExtraRunes); err != nil { return errorsJoin( ErrInvalidPath, err, ) } } previousPos = pos + 1 } if previousPos < len(path) { // don't care if the last char was a separator if err := validateUnreservedWithExtra(path[previousPos:], pcharExtraRunes); err != nil { return errorsJoin( ErrInvalidPath, err, ) } } return nil } // validateHost validates the host part. // // Reference: https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2 func (a authorityInfo) validateHost(host string, isIPv6 bool, schemes ...string) error { var unescapedHost string if strings.ContainsRune(host, '%') { // only proceed with PathUnescape if we need to (saves an alloc otherwise) var err error unescapedHost, err = url.PathUnescape(host) if err != nil { return errorsJoin( ErrInvalidHost, fmt.Errorf("invalid percent-encoding in the host part"), ) } } else { unescapedHost = host } if isIPv6 { return validateIPv6(unescapedHost) } // check for IPv4 address // // The host SHOULD check // the string syntactically for a dotted-decimal number before // looking it up in the Domain Name System. // IPv4 may contain percent-encoded escaped characters, e.g. 192.168.0.%31 is valid. 
// Reference: https://www.rfc-editor.org/rfc/rfc3986#appendix-A // // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet // dec-octet = DIGIT ; 0-9 // / %x31-39 DIGIT ; 10-99 // / "1" 2DIGIT ; 100-199 // / "2" %x30-34 DIGIT ; 200-249 // / "25" %x30-35 ; 250-255 if addr, err := netip.ParseAddr(unescapedHost); err == nil { if !addr.Is4() { return errorsJoin( ErrInvalidHostAddress, fmt.Errorf("a host as an address, without square brackets, should refer to an IPv4 address: %q", host), ) } return nil } // this is not an IP, check for host DNS or registered name if err := validateHostForScheme(host, unescapedHost, schemes...); err != nil { return errorsJoin( ErrInvalidHost, err, ) } return nil } func validateIPv6(unescapedHost string) error { // address the provision made in the RFC for a "IPvFuture" if unescapedHost[0] == 'v' || unescapedHost[0] == 'V' { if err := validateIPvFuture(unescapedHost[1:]); err != nil { return errorsJoin( ErrInvalidHostAddress, err, ) } return nil } // check for IPv6 address // IPv6 may contain percent-encoded escaped characters addr, err := netip.ParseAddr(unescapedHost) if err != nil { // RFC3986 stipulates that only IPv6 addresses are within square brackets return errorsJoin( ErrInvalidHostAddress, fmt.Errorf("a square-bracketed host part should be a valid IPv6 address: %q", unescapedHost), ) } if !addr.Is6() { return errorsJoin( ErrInvalidHostAddress, fmt.Errorf("a square-bracketed host part should not contain an IPv4 address: %q", unescapedHost), ) } return nil } // validateHostForScheme validates the host according to 2 different sets of rules: // - if the scheme is a scheme well-known for using DNS host names, the DNS host validation applies (RFC) // (applies to schemes at: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) // - otherwise, applies the "registered-name" validation stated by RFC 3986: // // dns-name see: https://www.rfc-editor.org/rfc/rfc1034, https://www.rfc-editor.org/info/rfc5890 // 
reg-name = *( unreserved / pct-encoded / sub-delims ) func validateHostForScheme(host, unescapedHost string, schemes ...string) error { for _, scheme := range schemes { if UsesDNSHostValidation(scheme) { if err := validateDNSHostForScheme(unescapedHost); err != nil { return err } } if err := validateRegisteredHostForScheme(host); err != nil { return err } } return nil } func validateDNSHostForScheme(unescapedHost string) error { // DNS name if len(unescapedHost) > 255 { // warning: size in bytes, not in runes (existing bug, or is it really?) -- TODO(fredbi) return errorsJoin( ErrInvalidDNSName, fmt.Errorf("hostname is longer than the allowed 255 characters"), ) } /* ::= | " " ::=