Replace linkRegex with xurls library (#6261)
* Replace linkRegex with xurls library Rather than maintaining a complicated regex to match URLs for autolinking, gitea can use this existing go library that takes care of the matching with very little code change to gitea itself. After spending a while trying to find the perfect regex for all cases this library still works better as it is more flexible than a single regex ever will be. This will also fix the following issues: #5844 #3095 #3381 This passes all our current tests and I've added new ones mentioned in those issues as well. * Use xurls.StrictMatchingScheme instead of xurls.Strict This is much faster and we only care about https? links to preserve existing behavior.
This commit is contained in:
		
							parent
							
								
									01bd1fcd33
								
							
						
					
					
						commit
						f2de5dc8c8
					
				| 
						 | 
				
			
			@ -725,6 +725,14 @@
 | 
			
		|||
  pruneopts = "NUT"
 | 
			
		||||
  revision = "02ccfbfaf0cc627aa3aec8ef7ed5cfeec5b43f63"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  digest = "1:63953ffb90bbc880c612d576fcfd973a5904277d25ec9e2d8d5719bf67969662"
 | 
			
		||||
  name = "github.com/mvdan/xurls"
 | 
			
		||||
  packages = ["."]
 | 
			
		||||
  pruneopts = "NUT"
 | 
			
		||||
  revision = "e52e821cbfe8fe163ff6f8628ab5869b11fc05af"
 | 
			
		||||
  version = "v2.0.0"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  digest = "1:2be1d891535ce3d6d2a3db9087f07415e909744e9eff1a30f8f0b2519df60ae6"
 | 
			
		||||
  name = "github.com/nfnt/resize"
 | 
			
		||||
| 
						 | 
				
			
			@ -1293,6 +1301,7 @@
 | 
			
		|||
    "github.com/mcuadros/go-version",
 | 
			
		||||
    "github.com/microcosm-cc/bluemonday",
 | 
			
		||||
    "github.com/msteinert/pam",
 | 
			
		||||
    "github.com/mvdan/xurls",
 | 
			
		||||
    "github.com/nfnt/resize",
 | 
			
		||||
    "github.com/pquerna/otp",
 | 
			
		||||
    "github.com/pquerna/otp/totp",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -113,3 +113,7 @@ ignored = ["google.golang.org/appengine*"]
 | 
			
		|||
[[constraint]]
 | 
			
		||||
  name = "github.com/prometheus/client_golang"
 | 
			
		||||
  version = "0.9.0"
 | 
			
		||||
 | 
			
		||||
[[constraint]]
 | 
			
		||||
  name = "github.com/mvdan/xurls"
 | 
			
		||||
  version = "2.0.0"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -17,6 +17,7 @@ import (
 | 
			
		|||
	"code.gitea.io/gitea/modules/util"
 | 
			
		||||
 | 
			
		||||
	"github.com/Unknwon/com"
 | 
			
		||||
	"github.com/mvdan/xurls"
 | 
			
		||||
	"golang.org/x/net/html"
 | 
			
		||||
	"golang.org/x/net/html/atom"
 | 
			
		||||
)
 | 
			
		||||
| 
						 | 
				
			
			@ -64,9 +65,7 @@ var (
 | 
			
		|||
	//   https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
 | 
			
		||||
	emailRegex = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
 | 
			
		||||
 | 
			
		||||
	// matches http/https links. used for autolinking those. partly modified from
 | 
			
		||||
	// the original present in autolink.js
 | 
			
		||||
	linkRegex = regexp.MustCompile(`(?:(?:http|https):\/\/(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+(?:\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)(?:(?:\/[\+~%\/\.\w\-]*)?\??(?:[\-\+:=&;%@\.\w]*)#?(?:[\.\!\/\\\w]*))?`)
 | 
			
		||||
	linkRegex, _ = xurls.StrictMatchingScheme("https?://")
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// regexp for full links to issues/pulls
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -104,6 +104,15 @@ func TestRender_links(t *testing.T) {
 | 
			
		|||
	test(
 | 
			
		||||
		"http://142.42.1.1/",
 | 
			
		||||
		`<p><a href="http://142.42.1.1/" rel="nofollow">http://142.42.1.1/</a></p>`)
 | 
			
		||||
	test(
 | 
			
		||||
		"https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd",
 | 
			
		||||
		`<p><a href="https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd" rel="nofollow">https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd</a></p>`)
 | 
			
		||||
	test(
 | 
			
		||||
		"https://en.wikipedia.org/wiki/URL_(disambiguation)",
 | 
			
		||||
		`<p><a href="https://en.wikipedia.org/wiki/URL_(disambiguation)" rel="nofollow">https://en.wikipedia.org/wiki/URL_(disambiguation)</a></p>`)
 | 
			
		||||
	test(
 | 
			
		||||
		"https://foo_bar.example.com/",
 | 
			
		||||
		`<p><a href="https://foo_bar.example.com/" rel="nofollow">https://foo_bar.example.com/</a></p>`)
 | 
			
		||||
 | 
			
		||||
	// Test that should *not* be turned into URL
 | 
			
		||||
	test(
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,27 @@
 | 
			
		|||
Copyright (c) 2015, Daniel Martí. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
 | 
			
		||||
   * Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
   * Redistributions in binary form must reproduce the above
 | 
			
		||||
copyright notice, this list of conditions and the following disclaimer
 | 
			
		||||
in the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
   * Neither the name of the copyright holder nor the names of its
 | 
			
		||||
contributors may be used to endorse or promote products derived from
 | 
			
		||||
this software without specific prior written permission.
 | 
			
		||||
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
			
		||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
			
		||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
			
		||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 | 
			
		||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 | 
			
		||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 | 
			
		||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 | 
			
		||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 | 
			
		||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 | 
			
		||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | 
			
		||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,299 @@
 | 
			
		|||
// Generated by schemesgen
 | 
			
		||||
 | 
			
		||||
package xurls
 | 
			
		||||
 | 
			
		||||
// Schemes is a sorted list of all IANA assigned schemes.
 | 
			
		||||
//
 | 
			
		||||
// Source:
 | 
			
		||||
//   https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
 | 
			
		||||
var Schemes = []string{
 | 
			
		||||
	`aaa`,
 | 
			
		||||
	`aaas`,
 | 
			
		||||
	`about`,
 | 
			
		||||
	`acap`,
 | 
			
		||||
	`acct`,
 | 
			
		||||
	`acr`,
 | 
			
		||||
	`adiumxtra`,
 | 
			
		||||
	`afp`,
 | 
			
		||||
	`afs`,
 | 
			
		||||
	`aim`,
 | 
			
		||||
	`appdata`,
 | 
			
		||||
	`apt`,
 | 
			
		||||
	`attachment`,
 | 
			
		||||
	`aw`,
 | 
			
		||||
	`barion`,
 | 
			
		||||
	`beshare`,
 | 
			
		||||
	`bitcoin`,
 | 
			
		||||
	`bitcoincash`,
 | 
			
		||||
	`blob`,
 | 
			
		||||
	`bolo`,
 | 
			
		||||
	`browserext`,
 | 
			
		||||
	`callto`,
 | 
			
		||||
	`cap`,
 | 
			
		||||
	`chrome`,
 | 
			
		||||
	`chrome-extension`,
 | 
			
		||||
	`cid`,
 | 
			
		||||
	`coap`,
 | 
			
		||||
	`coap+tcp`,
 | 
			
		||||
	`coap+ws`,
 | 
			
		||||
	`coaps`,
 | 
			
		||||
	`coaps+tcp`,
 | 
			
		||||
	`coaps+ws`,
 | 
			
		||||
	`com-eventbrite-attendee`,
 | 
			
		||||
	`content`,
 | 
			
		||||
	`conti`,
 | 
			
		||||
	`crid`,
 | 
			
		||||
	`cvs`,
 | 
			
		||||
	`data`,
 | 
			
		||||
	`dav`,
 | 
			
		||||
	`diaspora`,
 | 
			
		||||
	`dict`,
 | 
			
		||||
	`did`,
 | 
			
		||||
	`dis`,
 | 
			
		||||
	`dlna-playcontainer`,
 | 
			
		||||
	`dlna-playsingle`,
 | 
			
		||||
	`dns`,
 | 
			
		||||
	`dntp`,
 | 
			
		||||
	`dtn`,
 | 
			
		||||
	`dvb`,
 | 
			
		||||
	`ed2k`,
 | 
			
		||||
	`elsi`,
 | 
			
		||||
	`example`,
 | 
			
		||||
	`facetime`,
 | 
			
		||||
	`fax`,
 | 
			
		||||
	`feed`,
 | 
			
		||||
	`feedready`,
 | 
			
		||||
	`file`,
 | 
			
		||||
	`filesystem`,
 | 
			
		||||
	`finger`,
 | 
			
		||||
	`fish`,
 | 
			
		||||
	`ftp`,
 | 
			
		||||
	`geo`,
 | 
			
		||||
	`gg`,
 | 
			
		||||
	`git`,
 | 
			
		||||
	`gizmoproject`,
 | 
			
		||||
	`go`,
 | 
			
		||||
	`gopher`,
 | 
			
		||||
	`graph`,
 | 
			
		||||
	`gtalk`,
 | 
			
		||||
	`h323`,
 | 
			
		||||
	`ham`,
 | 
			
		||||
	`hcap`,
 | 
			
		||||
	`hcp`,
 | 
			
		||||
	`http`,
 | 
			
		||||
	`https`,
 | 
			
		||||
	`hxxp`,
 | 
			
		||||
	`hxxps`,
 | 
			
		||||
	`hydrazone`,
 | 
			
		||||
	`iax`,
 | 
			
		||||
	`icap`,
 | 
			
		||||
	`icon`,
 | 
			
		||||
	`im`,
 | 
			
		||||
	`imap`,
 | 
			
		||||
	`info`,
 | 
			
		||||
	`iotdisco`,
 | 
			
		||||
	`ipn`,
 | 
			
		||||
	`ipp`,
 | 
			
		||||
	`ipps`,
 | 
			
		||||
	`irc`,
 | 
			
		||||
	`irc6`,
 | 
			
		||||
	`ircs`,
 | 
			
		||||
	`iris`,
 | 
			
		||||
	`iris.beep`,
 | 
			
		||||
	`iris.lwz`,
 | 
			
		||||
	`iris.xpc`,
 | 
			
		||||
	`iris.xpcs`,
 | 
			
		||||
	`isostore`,
 | 
			
		||||
	`itms`,
 | 
			
		||||
	`jabber`,
 | 
			
		||||
	`jar`,
 | 
			
		||||
	`jms`,
 | 
			
		||||
	`keyparc`,
 | 
			
		||||
	`lastfm`,
 | 
			
		||||
	`ldap`,
 | 
			
		||||
	`ldaps`,
 | 
			
		||||
	`lvlt`,
 | 
			
		||||
	`magnet`,
 | 
			
		||||
	`mailserver`,
 | 
			
		||||
	`mailto`,
 | 
			
		||||
	`maps`,
 | 
			
		||||
	`market`,
 | 
			
		||||
	`message`,
 | 
			
		||||
	`microsoft.windows.camera`,
 | 
			
		||||
	`microsoft.windows.camera.multipicker`,
 | 
			
		||||
	`microsoft.windows.camera.picker`,
 | 
			
		||||
	`mid`,
 | 
			
		||||
	`mms`,
 | 
			
		||||
	`modem`,
 | 
			
		||||
	`mongodb`,
 | 
			
		||||
	`moz`,
 | 
			
		||||
	`ms-access`,
 | 
			
		||||
	`ms-browser-extension`,
 | 
			
		||||
	`ms-drive-to`,
 | 
			
		||||
	`ms-enrollment`,
 | 
			
		||||
	`ms-excel`,
 | 
			
		||||
	`ms-gamebarservices`,
 | 
			
		||||
	`ms-gamingoverlay`,
 | 
			
		||||
	`ms-getoffice`,
 | 
			
		||||
	`ms-help`,
 | 
			
		||||
	`ms-infopath`,
 | 
			
		||||
	`ms-inputapp`,
 | 
			
		||||
	`ms-lockscreencomponent-config`,
 | 
			
		||||
	`ms-media-stream-id`,
 | 
			
		||||
	`ms-mixedrealitycapture`,
 | 
			
		||||
	`ms-officeapp`,
 | 
			
		||||
	`ms-people`,
 | 
			
		||||
	`ms-project`,
 | 
			
		||||
	`ms-powerpoint`,
 | 
			
		||||
	`ms-publisher`,
 | 
			
		||||
	`ms-restoretabcompanion`,
 | 
			
		||||
	`ms-screenclip`,
 | 
			
		||||
	`ms-screensketch`,
 | 
			
		||||
	`ms-search`,
 | 
			
		||||
	`ms-search-repair`,
 | 
			
		||||
	`ms-secondary-screen-controller`,
 | 
			
		||||
	`ms-secondary-screen-setup`,
 | 
			
		||||
	`ms-settings`,
 | 
			
		||||
	`ms-settings-airplanemode`,
 | 
			
		||||
	`ms-settings-bluetooth`,
 | 
			
		||||
	`ms-settings-camera`,
 | 
			
		||||
	`ms-settings-cellular`,
 | 
			
		||||
	`ms-settings-cloudstorage`,
 | 
			
		||||
	`ms-settings-connectabledevices`,
 | 
			
		||||
	`ms-settings-displays-topology`,
 | 
			
		||||
	`ms-settings-emailandaccounts`,
 | 
			
		||||
	`ms-settings-language`,
 | 
			
		||||
	`ms-settings-location`,
 | 
			
		||||
	`ms-settings-lock`,
 | 
			
		||||
	`ms-settings-nfctransactions`,
 | 
			
		||||
	`ms-settings-notifications`,
 | 
			
		||||
	`ms-settings-power`,
 | 
			
		||||
	`ms-settings-privacy`,
 | 
			
		||||
	`ms-settings-proximity`,
 | 
			
		||||
	`ms-settings-screenrotation`,
 | 
			
		||||
	`ms-settings-wifi`,
 | 
			
		||||
	`ms-settings-workplace`,
 | 
			
		||||
	`ms-spd`,
 | 
			
		||||
	`ms-sttoverlay`,
 | 
			
		||||
	`ms-transit-to`,
 | 
			
		||||
	`ms-useractivityset`,
 | 
			
		||||
	`ms-virtualtouchpad`,
 | 
			
		||||
	`ms-visio`,
 | 
			
		||||
	`ms-walk-to`,
 | 
			
		||||
	`ms-whiteboard`,
 | 
			
		||||
	`ms-whiteboard-cmd`,
 | 
			
		||||
	`ms-word`,
 | 
			
		||||
	`msnim`,
 | 
			
		||||
	`msrp`,
 | 
			
		||||
	`msrps`,
 | 
			
		||||
	`mtqp`,
 | 
			
		||||
	`mumble`,
 | 
			
		||||
	`mupdate`,
 | 
			
		||||
	`mvn`,
 | 
			
		||||
	`news`,
 | 
			
		||||
	`nfs`,
 | 
			
		||||
	`ni`,
 | 
			
		||||
	`nih`,
 | 
			
		||||
	`nntp`,
 | 
			
		||||
	`notes`,
 | 
			
		||||
	`ocf`,
 | 
			
		||||
	`oid`,
 | 
			
		||||
	`onenote`,
 | 
			
		||||
	`onenote-cmd`,
 | 
			
		||||
	`opaquelocktoken`,
 | 
			
		||||
	`openpgp4fpr`,
 | 
			
		||||
	`pack`,
 | 
			
		||||
	`palm`,
 | 
			
		||||
	`paparazzi`,
 | 
			
		||||
	`pkcs11`,
 | 
			
		||||
	`platform`,
 | 
			
		||||
	`pop`,
 | 
			
		||||
	`pres`,
 | 
			
		||||
	`prospero`,
 | 
			
		||||
	`proxy`,
 | 
			
		||||
	`pwid`,
 | 
			
		||||
	`psyc`,
 | 
			
		||||
	`qb`,
 | 
			
		||||
	`query`,
 | 
			
		||||
	`redis`,
 | 
			
		||||
	`rediss`,
 | 
			
		||||
	`reload`,
 | 
			
		||||
	`res`,
 | 
			
		||||
	`resource`,
 | 
			
		||||
	`rmi`,
 | 
			
		||||
	`rsync`,
 | 
			
		||||
	`rtmfp`,
 | 
			
		||||
	`rtmp`,
 | 
			
		||||
	`rtsp`,
 | 
			
		||||
	`rtsps`,
 | 
			
		||||
	`rtspu`,
 | 
			
		||||
	`secondlife`,
 | 
			
		||||
	`service`,
 | 
			
		||||
	`session`,
 | 
			
		||||
	`sftp`,
 | 
			
		||||
	`sgn`,
 | 
			
		||||
	`shttp`,
 | 
			
		||||
	`sieve`,
 | 
			
		||||
	`simpleledger`,
 | 
			
		||||
	`sip`,
 | 
			
		||||
	`sips`,
 | 
			
		||||
	`skype`,
 | 
			
		||||
	`smb`,
 | 
			
		||||
	`sms`,
 | 
			
		||||
	`smtp`,
 | 
			
		||||
	`snews`,
 | 
			
		||||
	`snmp`,
 | 
			
		||||
	`soap.beep`,
 | 
			
		||||
	`soap.beeps`,
 | 
			
		||||
	`soldat`,
 | 
			
		||||
	`spiffe`,
 | 
			
		||||
	`spotify`,
 | 
			
		||||
	`ssh`,
 | 
			
		||||
	`steam`,
 | 
			
		||||
	`stun`,
 | 
			
		||||
	`stuns`,
 | 
			
		||||
	`submit`,
 | 
			
		||||
	`svn`,
 | 
			
		||||
	`tag`,
 | 
			
		||||
	`teamspeak`,
 | 
			
		||||
	`tel`,
 | 
			
		||||
	`teliaeid`,
 | 
			
		||||
	`telnet`,
 | 
			
		||||
	`tftp`,
 | 
			
		||||
	`things`,
 | 
			
		||||
	`thismessage`,
 | 
			
		||||
	`tip`,
 | 
			
		||||
	`tn3270`,
 | 
			
		||||
	`tool`,
 | 
			
		||||
	`turn`,
 | 
			
		||||
	`turns`,
 | 
			
		||||
	`tv`,
 | 
			
		||||
	`udp`,
 | 
			
		||||
	`unreal`,
 | 
			
		||||
	`urn`,
 | 
			
		||||
	`ut2004`,
 | 
			
		||||
	`v-event`,
 | 
			
		||||
	`vemmi`,
 | 
			
		||||
	`ventrilo`,
 | 
			
		||||
	`videotex`,
 | 
			
		||||
	`vnc`,
 | 
			
		||||
	`view-source`,
 | 
			
		||||
	`wais`,
 | 
			
		||||
	`webcal`,
 | 
			
		||||
	`wpid`,
 | 
			
		||||
	`ws`,
 | 
			
		||||
	`wss`,
 | 
			
		||||
	`wtai`,
 | 
			
		||||
	`wyciwyg`,
 | 
			
		||||
	`xcon`,
 | 
			
		||||
	`xcon-userid`,
 | 
			
		||||
	`xfire`,
 | 
			
		||||
	`xmlrpc.beep`,
 | 
			
		||||
	`xmlrpc.beeps`,
 | 
			
		||||
	`xmpp`,
 | 
			
		||||
	`xri`,
 | 
			
		||||
	`ymsgr`,
 | 
			
		||||
	`z39.50`,
 | 
			
		||||
	`z39.50r`,
 | 
			
		||||
	`z39.50s`,
 | 
			
		||||
}
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| 
						 | 
				
			
			@ -0,0 +1,24 @@
 | 
			
		|||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
 | 
			
		||||
// See LICENSE for licensing information
 | 
			
		||||
 | 
			
		||||
package xurls
 | 
			
		||||
 | 
			
		||||
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
 | 
			
		||||
//
 | 
			
		||||
// Sources:
 | 
			
		||||
//  * https://en.wikipedia.org/wiki/Pseudo-top-level_domain
 | 
			
		||||
//  * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
 | 
			
		||||
//  * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
 | 
			
		||||
//  * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
 | 
			
		||||
var PseudoTLDs = []string{
 | 
			
		||||
	`bit`,       // Namecoin
 | 
			
		||||
	`example`,   // Example domain
 | 
			
		||||
	`exit`,      // Tor exit node
 | 
			
		||||
	`gnu`,       // GNS by public key
 | 
			
		||||
	`i2p`,       // I2P network
 | 
			
		||||
	`invalid`,   // Invalid domain
 | 
			
		||||
	`local`,     // Local network
 | 
			
		||||
	`localhost`, // Local network
 | 
			
		||||
	`test`,      // Test domain
 | 
			
		||||
	`zkey`,      // GNS domain name
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,107 @@
 | 
			
		|||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
 | 
			
		||||
// See LICENSE for licensing information
 | 
			
		||||
 | 
			
		||||
// Package xurls extracts urls from plain text using regular expressions.
 | 
			
		||||
package xurls
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"bytes"
 | 
			
		||||
	"regexp"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
//go:generate go run generate/tldsgen/main.go
 | 
			
		||||
//go:generate go run generate/schemesgen/main.go
 | 
			
		||||
 | 
			
		||||
// Regular-expression fragments (Go RE2 syntax) from which the URL patterns in
// this file are concatenated. The fragments whose names end in "Char" (and the
// Unicode classes they are built from) are character-class contents: every use
// below places them between "[" and "]".
const (
 | 
			
		||||
	letter    = `\p{L}` // Unicode category L (letters)
 | 
			
		||||
	mark      = `\p{M}` // Unicode category M (marks)
 | 
			
		||||
	number    = `\p{N}` // Unicode category N (numbers)
 | 
			
		||||
	iriChar   = letter + mark + number // characters allowed in a host label (see iri)
 | 
			
		||||
	currency  = `\p{Sc}` // Unicode category Sc (currency symbols)
 | 
			
		||||
	otherSymb = `\p{So}` // Unicode category So (other symbols)
 | 
			
		||||
	endChar   = iriChar + `/\-+_&~*%=#` + currency + otherSymb // characters pathCont accepts as the last character of a match
 | 
			
		||||
	otherPunc = `\p{Po}` // Unicode category Po (other punctuation)
 | 
			
		||||
	// midChar is only ever used inside "[...]", so the `|` below is a literal
	// pipe character added to the class, not an alternation operator.
	midChar   = endChar + `|` + otherPunc
 | 
			
		||||
	// wellParen, wellBrack and wellBrace match a balanced (...), [...] or {...}
	// group of midChar characters, allowing one level of nesting of the same
	// bracket kind inside it.
	wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
 | 
			
		||||
	wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
 | 
			
		||||
	wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
 | 
			
		||||
	wellAll   = wellParen + `|` + wellBrack + `|` + wellBrace // any one of the three balanced groups
 | 
			
		||||
	// pathCont matches the part of a URL after the scheme: runs of midChar
	// characters, each run required to be followed by a balanced group or an
	// endChar, so a match cannot end on a midChar-only character.
	pathCont  = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`
 | 
			
		||||
 | 
			
		||||
	// iri is a single host label: it starts and ends with an iriChar and may
	// contain hyphens in between.
	iri      = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
 | 
			
		||||
	domain   = `(` + iri + `\.)+` // one or more labels, each followed by a dot
 | 
			
		||||
	octet    = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])` // decimal 0-255 without leading zeros
 | 
			
		||||
	ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b` // dotted quad delimited by word boundaries
 | 
			
		||||
	// ipv6Addr: colon-separated groups of 1-4 hex digits, including forms with
	// "::" and forms ending in a dotted-decimal tail.
	// NOTE(review): exact coverage of IPv6 textual forms not verified here.
	ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
 | 
			
		||||
	ipAddr   = `(` + ipv4Addr + `|` + ipv6Addr + `)` // either address family
 | 
			
		||||
	port     = `(:[0-9]*)?` // optional ":" followed by zero or more digits (a bare ":" also matches)
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
 | 
			
		||||
// scheme, and not just the known ones.
 | 
			
		||||
var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
 | 
			
		||||
 | 
			
		||||
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
 | 
			
		||||
// followed by ":" instead of "://".
 | 
			
		||||
var SchemesNoAuthority = []string{
 | 
			
		||||
	`bitcoin`, // Bitcoin
 | 
			
		||||
	`file`,    // Files
 | 
			
		||||
	`magnet`,  // Torrent magnets
 | 
			
		||||
	`mailto`,  // Mail
 | 
			
		||||
	`sms`,     // SMS
 | 
			
		||||
	`tel`,     // Telephone
 | 
			
		||||
	`xmpp`,    // XMPP
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// anyOf returns a parenthesised regexp alternation that matches any one of
// strs literally. Each string is escaped with regexp.QuoteMeta, so regexp
// metacharacters in the inputs lose their special meaning. With no arguments
// the result is "()".
func anyOf(strs ...string) string {
	alternation := bytes.NewBufferString("(")
	sep := "" // empty before the first alternative, "|" afterwards
	for _, literal := range strs {
		alternation.WriteString(sep)
		alternation.WriteString(regexp.QuoteMeta(literal))
		sep = "|"
	}
	alternation.WriteString(")")
	return alternation.String()
}
 | 
			
		||||
 | 
			
		||||
func strictExp() string {
 | 
			
		||||
	schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)`
 | 
			
		||||
	return `(?i)` + schemes + `(?-i)` + pathCont
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func relaxedExp() string {
 | 
			
		||||
	site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)`
 | 
			
		||||
	hostName := `(` + site + `|` + ipAddr + `)`
 | 
			
		||||
	webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)`
 | 
			
		||||
	return strictExp() + `|` + webURL
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Strict produces a regexp that matches any URL with a scheme in either the
 | 
			
		||||
// Schemes or SchemesNoAuthority lists.
 | 
			
		||||
func Strict() *regexp.Regexp {
 | 
			
		||||
	re := regexp.MustCompile(strictExp())
 | 
			
		||||
	re.Longest()
 | 
			
		||||
	return re
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
 | 
			
		||||
// URL with no scheme.
 | 
			
		||||
func Relaxed() *regexp.Regexp {
 | 
			
		||||
	re := regexp.MustCompile(relaxedExp())
 | 
			
		||||
	re.Longest()
 | 
			
		||||
	return re
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
 | 
			
		||||
// the scheme match the given regular expression. See AnyScheme too.
 | 
			
		||||
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
 | 
			
		||||
	strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont
 | 
			
		||||
	re, err := regexp.Compile(strictMatching)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	re.Longest()
 | 
			
		||||
	return re, nil
 | 
			
		||||
}
 | 
			
		||||
		Loading…
	
		Reference in New Issue