Euphoria
Ticket #827:
function parse in std/net/url.e does not parse correctly
-
Reported by
jmduro
Dec 18, 2012
The function parse fails to parse many URLs correctly, such as the ones produced by Google Search with "site:" in the query string, and many of the others listed below.
I propose the following replacement, here within its test program:
include std/sequence.e
include std/convert.e
-- Sample URLs fed to parse() by the test loop of this program.
constant URLS = {
    -- http with and without an explicit port / trailing slash
    "http://www.google.com",
    "http://www.google.com/",
    "http://www.google.com:8080",
    "http://www.google.com:8080/",
    -- no scheme at all, only host and port
    "www.google.fr:8080",
    "www.google.fr:8080/",
    -- user info, port, path and query string
    "http://user:pass@www.debian.org:80/index.html?name=John&age=39",
    "http://www.google.com/search?q=allintext:+openstack+folsom&hl=en&ie=UTF-8&oe=UTF-8&prmd=ivns&ei=iyvPUOjOLqLL0AWYvYHoCQ&start=10&sa=N",
    -- file URLs with three, four, two-plus-three and five slashes
    "file:///etc/hosts",
    "file:////remotehost/share/dir/file.txt",
    "file://localhost///remotehost/share/dir/file.txt",
    "file://///remotehost/share/dir/file.txt",
    -- file path that itself contains ':' characters
    "file:///home/labo/00:14:22:fb:52:01_2011-04-29-15-img/Info-dmi.txt",
    -- other schemes
    "ftp://ftp.is.co.za/rfc/rfc1808.txt",
    "ldap://[2001:db8::7]/c=GB?objectClass?one",
    "telnet://192.0.2.16:80/",
    "mailto:John.Doe@example.com",
    "news:comp.infosystems.www.servers.unix"
}
-- Split the authority part of a URL, "[user[:password]@]host[:port]",
-- into its components.
--
-- Parameters:
--   url : the authority text only (no scheme, path or query string).
--
-- Returns:
--   { host_name, port, user_name, password }.
--   Text fields that are absent are returned as "". The port is 0 when it
--   is absent or when its text does not convert to an integer.
function parseAddress(sequence url)
    sequence host_name = "", user_name = "", password = ""
    sequence host_port
    object port_value
    integer port = 0, delim = 0, p = 0, scan_from = 1

    delim = match("@", url)
    if delim then
        user_name = url[1..delim-1]
        -- Only the first ':' separates user and password: a user without a
        -- password is accepted and a password may itself contain ':'.
        p = match(":", user_name)
        if p then
            password = user_name[p+1..$]
            user_name = user_name[1..p-1]
        end if
    end if

    -- Everything after the '@' (or the whole text when there is none).
    host_port = url[delim+1..$]

    -- A bracketed host such as "[2001:db8::7]" contains ':' itself, so only
    -- a ':' located after the closing bracket can introduce the port.
    if length(host_port) and host_port[1] = '[' then
        scan_from = match("]", host_port)
        if scan_from = 0 then
            scan_from = 1
        end if
    end if

    p = match(":", host_port, scan_from)
    if p then -- port
        host_name = host_port[1..p-1]
        port_value = to_number(host_port[p+1..$])
        -- Keep the default 0 rather than failing the integer type check
        -- when the port text is not a whole number.
        if integer(port_value) then
            port = port_value
        end if
    else
        host_name = host_port
    end if
    return { host_name, port, user_name, password }
end function
-- Break a URL into its components.
--
-- Parameters:
--   url : the URL text.
--
-- Returns:
--   { protocol, host_name, port, path, user_name, password, query_string }.
--   Text fields that are not present are "" and a missing port is 0.
--   Paths are returned without their leading '/', except for
--   "file:///..." URLs (three slashes) where the leading '/' is kept.
--   -1 is returned, after a message is printed on file 1, when the number
--   of slashes following the scheme is not acceptable.
--
-- Comments:
--   A text without "://" but with a ':' is treated as "mailto", "news" or,
--   failing that, as an http address written without its scheme.
function parse(sequence url)
    sequence protocol = "", s, rest
    sequence host_name = "", path = "", user_name = "", password = "", query_string = ""
    integer port = 0, nbSlashes = 0, delim1 = 0, delim2 = 0, delim3 = 0, p = 0

    delim1 = match("://", url)
    if delim1 then
        protocol = url[1..delim1-1]
        p = delim1+1
        -- There might be up to 5 slashes as in
        -- "file://///remotehost/share/dir/file.txt"
        -- (http://en.wikipedia.org/wiki/File_URI_scheme).
        -- The bound check stops the scan when the URL ends with slashes.
        while p <= length(url) and url[p] = '/' do
            nbSlashes += 1
            p += 1
        end while
        if (nbSlashes>2) and not equal(protocol, "file") then
            printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
            return -1
        end if

        switch nbSlashes do
            case 2 then
                delim2 = match("/", url, p)
                if equal(protocol, "ldap") then
                    -- The host is taken verbatim: it may be a bracketed
                    -- IPv6 literal full of ':' characters.
                    if delim2 then
                        host_name = url[p..delim2-1]
                        delim3 = match("?", url, delim2+1)
                        if delim3 then
                            path = url[delim2+1..delim3-1]
                            query_string = url[delim3+1..$]
                        else
                            path = url[delim2+1..$]
                        end if
                    else
                        host_name = url[p..$]
                    end if
                elsif equal(protocol, "file") then
                    if delim2 and delim2+2 <= length(url) and equal(url[delim2..delim2+2], "///") then
                        -- "file://localhost///remotehost/..."
                        delim3 = match("/", url, delim2+3)
                        if delim3 then
                            host_name = url[delim2+3..delim3-1]
                            path = url[delim3+1..$]
                        else
                            host_name = url[delim2+3..$]
                        end if
                    elsif delim2 then
                        -- "file://host/path": only the text before the first
                        -- '/' is the address, the rest is the path.
                        {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
                        path = url[delim2+1..$]
                    else
                        {host_name, port, user_name, password } = parseAddress(url[p..$])
                    end if
                else
                    delim3 = match("?", url, p)
                    if delim3 and (delim2 = 0 or delim3 < delim2) then
                        -- Query string directly after the address, without
                        -- any path ("http://host?x=1").
                        {host_name, port, user_name, password } = parseAddress(url[p..delim3-1])
                        query_string = url[delim3+1..$]
                    elsif delim2 then
                        {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
                        -- Here delim3 is either 0 or located after delim2.
                        if delim3 then
                            path = url[delim2+1..delim3-1]
                            query_string = url[delim3+1..$]
                        else
                            path = url[delim2+1..$]
                        end if
                    else
                        {host_name, port, user_name, password } = parseAddress(url[p..$])
                    end if
                end if
            case 3 then
                -- "file:///path": local file, the path keeps its leading '/'.
                host_name = "localhost"
                path = url[p-1..$]
            case 4, 5 then
                -- "file:////host/path" and "file://///host/path"
                delim2 = match("/", url, p)
                if delim2 then
                    host_name = url[p..delim2-1]
                    path = url[delim2+1..$]
                else
                    host_name = url[p..$]
                end if
            case else
                printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
                return -1
        end switch
    else
        delim1 = match(":", url)
        if delim1 then
            s = url[1..delim1-1]
            if equal(s, "mailto") then
                protocol = "mailto"
                rest = url[delim1+1..$]
                delim2 = match("@", rest)
                if delim2 then
                    user_name = rest[1..delim2-1]
                    host_name = rest[delim2+1..$]
                else
                    -- No '@': keep the text as the user name instead of
                    -- failing on a missing second part.
                    user_name = rest
                end if
            elsif equal(s, "news") then
                protocol = "news"
                delim2 = match("/", url, delim1+1)
                if delim2 then
                    host_name = url[delim1+1..delim2-1]
                else
                    host_name = url[delim1+1..$]
                end if
            else
                -- No scheme given: the ':' found belongs to the address
                -- (port or password separator); http is assumed.
                protocol = "http"
                delim2 = match("/", url, delim1+1)
                delim3 = match("?", url, delim1+1)
                if delim3 and (delim2 = 0 or delim3 < delim2) then
                    {host_name, port, user_name, password } = parseAddress(url[1..delim3-1])
                    query_string = url[delim3+1..$]
                elsif delim2 then
                    {host_name, port, user_name, password } = parseAddress(url[1..delim2-1])
                    -- Here delim3 is either 0 or located after delim2.
                    if delim3 then
                        path = url[delim2+1..delim3-1]
                        query_string = url[delim3+1..$]
                    else
                        path = url[delim2+1..$]
                    end if
                else
                    {host_name, port, user_name, password } = parseAddress(url[1..$])
                end if
            end if
        end if
    end if
    return { protocol, host_name, port, path, user_name, password, query_string }
end function
-- Test driver: parse every entry of URLS and print the resulting fields.
-- parse() returns the atom -1 for a URL it rejects, so the result is held
-- in an object and checked before its elements are accessed.
object parsed
puts(1, "\n#######################################################################\n")
for i = 1 to length(URLS) do
    parsed = parse(URLS[i])
    puts(1, "\n"&URLS[i]&"\n")
    if atom(parsed) then
        puts(1, " (could not be parsed)\n")
    else
        puts(1, " Protocol .. : "&parsed[1]&"\n")
        puts(1, " Host name . : "&parsed[2]&"\n")
        printf(1, " Port ...... : %d\n", parsed[3])
        puts(1, " Path ...... : "&parsed[4]&"\n")
        puts(1, " User name . : "&parsed[5]&"\n")
        puts(1, " Password .. :"&parsed[6]&"\n")
        puts(1, " Query string: "&parsed[7]&"\n")
    end if
end for
Regards
Jean-Marc
Details
1. Comment by jmduro
Dec 18, 2012
A small change to make the replacement fully compatible with the original function, which returns objects in the result sequence rather than only sequences as my previous version did:
-- Split the authority part of a URL, "[user[:password]@]host[:port]",
-- into its components.
--
-- Parameters:
--   url : the authority text only (no scheme, path or query string).
--
-- Returns:
--   { host_name, port, user_name, password }.
--   host_name is always a sequence. user_name and password are 0 when they
--   are absent. The port is 0 when it is absent or when its text does not
--   convert to an integer.
function parseAddress(sequence url)
    object host_name = 0, user_name = 0, password = 0
    sequence host_port
    object port_value
    integer port = 0, delim = 0, p = 0, scan_from = 1

    delim = match("@", url)
    if delim then
        user_name = url[1..delim-1]
        -- Only the first ':' separates user and password: a user without a
        -- password is accepted and a password may itself contain ':'.
        p = match(":", user_name)
        if p then
            password = user_name[p+1..$]
            user_name = user_name[1..p-1]
        end if
    end if

    -- Everything after the '@' (or the whole text when there is none).
    host_port = url[delim+1..$]

    -- A bracketed host such as "[2001:db8::7]" contains ':' itself, so only
    -- a ':' located after the closing bracket can introduce the port.
    if length(host_port) and host_port[1] = '[' then
        scan_from = match("]", host_port)
        if scan_from = 0 then
            scan_from = 1
        end if
    end if

    p = match(":", host_port, scan_from)
    if p then -- port
        host_name = host_port[1..p-1]
        port_value = to_number(host_port[p+1..$])
        -- Keep the default 0 rather than failing the integer type check
        -- when the port text is not a whole number.
        if integer(port_value) then
            port = port_value
        end if
    else
        host_name = host_port
    end if
    return { host_name, port, user_name, password }
end function
-- Break a URL into its components.
--
-- Parameters:
--   url : the URL text.
--   querystring_also : when non-zero, a non-empty query string is passed
--       to parse_querystring() (defined outside this snippet) and its
--       result is returned in place of the raw text.
--
-- Returns:
--   { protocol, host_name, port, path, user_name, password, query_string }.
--   protocol is a sequence; the other text fields are 0 when not present
--   and a missing port is 0. Paths include their leading '/'.
--   -1 is returned, after a message is printed on file 1, when the number
--   of slashes following the scheme is not acceptable.
--
-- Comments:
--   A text without "://" but with a ':' is treated as "mailto", "news" or,
--   failing that, as an http address written without its scheme.
public function parse(sequence url, integer querystring_also=0)
    sequence protocol = "", s, rest
    object host_name = 0, path = 0, user_name = 0, password = 0, query_string = 0
    integer port = 0, nbSlashes = 0, delim1 = 0, delim2 = 0, delim3 = 0, p = 0

    delim1 = match("://", url)
    if delim1 then
        protocol = url[1..delim1-1]
        p = delim1+1
        -- There might be up to 5 slashes as in
        -- "file://///remotehost/share/dir/file.txt"
        -- (http://en.wikipedia.org/wiki/File_URI_scheme).
        -- The bound check stops the scan when the URL ends with slashes.
        while p <= length(url) and url[p] = '/' do
            nbSlashes += 1
            p += 1
        end while
        if (nbSlashes>2) and not equal(protocol, "file") then
            printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
            return -1
        end if

        switch nbSlashes do
            case 2 then
                delim2 = match("/", url, p)
                if equal(protocol, "ldap") then
                    -- The host is taken verbatim: it may be a bracketed
                    -- IPv6 literal full of ':' characters.
                    if delim2 then
                        host_name = url[p..delim2-1]
                        delim3 = match("?", url, delim2+1)
                        if delim3 then
                            path = url[delim2..delim3-1]
                            query_string = url[delim3+1..$]
                        else
                            path = url[delim2..$]
                        end if
                    else
                        host_name = url[p..$]
                    end if
                elsif equal(protocol, "file") then
                    if delim2 and delim2+2 <= length(url) and equal(url[delim2..delim2+2], "///") then
                        -- "file://localhost///remotehost/..."
                        delim3 = match("/", url, delim2+3)
                        if delim3 then
                            host_name = url[delim2+3..delim3-1]
                            path = url[delim3..$]
                        else
                            host_name = url[delim2+3..$]
                        end if
                    elsif delim2 then
                        -- "file://host/path": only the text before the first
                        -- '/' is the address, the rest is the path.
                        {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
                        path = url[delim2..$]
                    else
                        {host_name, port, user_name, password } = parseAddress(url[p..$])
                    end if
                else
                    delim3 = match("?", url, p)
                    if delim3 and (delim2 = 0 or delim3 < delim2) then
                        -- Query string directly after the address, without
                        -- any path ("http://host?x=1").
                        {host_name, port, user_name, password } = parseAddress(url[p..delim3-1])
                        query_string = url[delim3+1..$]
                    elsif delim2 then
                        {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
                        -- Here delim3 is either 0 or located after delim2.
                        if delim3 then
                            path = url[delim2..delim3-1]
                            query_string = url[delim3+1..$]
                        else
                            path = url[delim2..$]
                        end if
                    else
                        {host_name, port, user_name, password } = parseAddress(url[p..$])
                    end if
                end if
            case 3 then
                -- "file:///path": local file, the path keeps its leading '/'.
                host_name = "localhost"
                path = url[p-1..$]
            case 4, 5 then
                -- "file:////host/path" and "file://///host/path"
                delim2 = match("/", url, p)
                if delim2 then
                    host_name = url[p..delim2-1]
                    path = url[delim2..$]
                else
                    host_name = url[p..$]
                end if
            case else
                printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
                return -1
        end switch
    else
        delim1 = match(":", url)
        if delim1 then
            s = url[1..delim1-1]
            if equal(s, "mailto") then
                protocol = "mailto"
                rest = url[delim1+1..$]
                delim2 = match("@", rest)
                if delim2 then
                    user_name = rest[1..delim2-1]
                    host_name = rest[delim2+1..$]
                else
                    -- No '@': keep the text as the user name instead of
                    -- failing on a missing second part.
                    user_name = rest
                end if
            elsif equal(s, "news") then
                protocol = "news"
                delim2 = match("/", url, delim1+1)
                if delim2 then
                    host_name = url[delim1+1..delim2-1]
                else
                    host_name = url[delim1+1..$]
                end if
            else
                -- No scheme given: the ':' found belongs to the address
                -- (port or password separator); http is assumed.
                protocol = "http"
                delim2 = match("/", url, delim1+1)
                delim3 = match("?", url, delim1+1)
                if delim3 and (delim2 = 0 or delim3 < delim2) then
                    {host_name, port, user_name, password } = parseAddress(url[1..delim3-1])
                    query_string = url[delim3+1..$]
                elsif delim2 then
                    {host_name, port, user_name, password } = parseAddress(url[1..delim2-1])
                    -- Here delim3 is either 0 or located after delim2.
                    if delim3 then
                        path = url[delim2..delim3-1]
                        query_string = url[delim3+1..$]
                    else
                        path = url[delim2..$]
                    end if
                else
                    {host_name, port, user_name, password } = parseAddress(url[1..$])
                end if
            end if
        end if
    end if

    -- query_string is still the atom 0 when the URL has no query part, so
    -- it must be checked to be a sequence before its length is taken.
    if querystring_also and sequence(query_string) and length(query_string) then
        query_string = parse_querystring(query_string)
    end if
    return { protocol, host_name, port, path, user_name, password, query_string }
end function
Two include lines have to be added:
include std/sequence.e
include std/convert.e