Euphoria Ticket #827: function parse in std/net/url.e does not parse correctly

The parse function fails to parse many URLs correctly, such as those produced by Google Search with "site:" in the query string, as well as many of the URLs listed below.

I propose the following replacement, shown here embedded in its test program:

include std/sequence.e 
include std/convert.e 
 
-- Sample URLs that the test loop at the end of this program passes to parse(). 
constant URLS = { 
 -- http scheme, with and without an explicit port and trailing slash 
 "http://www.google.com", 
 "http://www.google.com/", 
 "http://www.google.com:8080", 
 "http://www.google.com:8080/", 
 -- no scheme at all: host:port only 
 "www.google.fr:8080", 
 "www.google.fr:8080/", 
 -- user:password credentials, a path and a query string 
 "http://user:pass@www.debian.org:80/index.html?name=John&age=39", 
 -- query string that itself contains a ':' 
 "http://www.google.com/search?q=allintext:+openstack+folsom&hl=en&ie=UTF-8&oe=UTF-8&prmd=ivns&ei=iyvPUOjOLqLL0AWYvYHoCQ&start=10&sa=N", 
 -- file scheme with 3, 4, 2 (followed by "///") and 5 slashes 
 "file:///etc/hosts", 
 "file:////remotehost/share/dir/file.txt", 
 "file://localhost///remotehost/share/dir/file.txt", 
 "file://///remotehost/share/dir/file.txt", 
 -- file path containing ':' characters 
 "file:///home/labo/00:14:22:fb:52:01_2011-04-29-15-img/Info-dmi.txt", 
 -- other schemes 
 "ftp://ftp.is.co.za/rfc/rfc1808.txt", 
 "ldap://[2001:db8::7]/c=GB?objectClass?one", 
 "telnet://192.0.2.16:80/", 
 -- schemes written without "//" 
 "mailto:John.Doe@example.com", 
 "news:comp.infosystems.www.servers.unix" 
} 
 
-- Split the authority part of a URL ("user:password@host:port") into its
-- components.
--
-- Parameters:
--   url : the authority text, without scheme and without path.
--
-- Returns:
--   { host_name, port, user_name, password }.  Missing text fields are
--   empty sequences; port is 0 when absent or not a valid integer.
function parseAddress(sequence url)
  sequence host_name = "", user_name = "", password = "", host_part
  integer port = 0, at_pos = 0, colon = 0, search_from = 1
  object port_value

  at_pos = match("@", url)
  if at_pos then
    -- Credentials: split on the first ':' only, and tolerate a missing
    -- password ("user@host") instead of failing on the destructuring.
    colon = find(':', url[1..at_pos-1])
    if colon then
      user_name = url[1..colon-1]
      password = url[colon+1..at_pos-1]
    else
      user_name = url[1..at_pos-1]
    end if
  end if

  -- Everything after the '@' (or the whole text when there is none).
  host_part = url[at_pos+1..$]

  -- A bracketed IPv6 literal contains ':' itself; only look for the port
  -- separator after the closing bracket.
  if length(host_part) and host_part[1] = '[' then
    search_from = find(']', host_part)
    if search_from = 0 then
      search_from = 1
    end if
  end if

  colon = find(':', host_part, search_from)
  if colon then  -- port
    host_name = host_part[1..colon-1]
    port_value = to_number(host_part[colon+1..$])
    -- Keep the default 0 rather than tripping the integer type check.
    if integer(port_value) then
      port = port_value
    end if
  else
    host_name = host_part
  end if
  return { host_name, port, user_name, password }
end function
 
-- Split a URL into its components.
--
-- Parameters:
--   url : the URL text.
--
-- Returns:
--   { protocol, host_name, port, path, user_name, password, query_string }
--   or -1 (after printing a message) when the number of slashes following
--   the scheme is not acceptable.  Absent text fields are empty sequences
--   and an absent port is 0.  In this version the path is returned without
--   its leading '/', except for the three-slash "file:///..." form.
--
-- Notes:
--   * "scheme://" URLs are dispatched on the number of slashes after the ':'.
--   * Without "://", "mailto:" and "news:" are recognised; any other text
--     containing ':' is treated as an http "host:port[/path[?query]]".
--   * Text with no ':' at all yields a result with every field empty.
function parse(sequence url)
  sequence protocol = "", s, rest
  sequence host_name = "", path = "", user_name = "", password = "", query_string = ""
  integer port = 0, nbSlashes = 0, delim1 = 0, delim2 = 0, delim3 = 0, p = 0

  delim1 = match("://", url)
  if delim1 then
    protocol = url[1..delim1-1]
    p = delim1+1
    -- Count the slashes; there might be up to 5, as in
    -- "file://///remotehost/share/dir/file.txt"
    -- (http://en.wikipedia.org/wiki/File_URI_scheme).
    -- The bounds test stops the scan on input such as "http://".
    while p <= length(url) and url[p] = '/' do
      nbSlashes += 1
      p += 1
    end while
    if (nbSlashes > 2) and not equal(protocol, "file") then
      printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
      return -1
    end if

    switch nbSlashes do
      case 2 then
        delim2 = match("/", url, p)
        if equal(protocol, "ldap") then
          -- The host is kept verbatim (it may be a bracketed IPv6 literal).
          if delim2 then
            host_name = url[p..delim2-1]
            delim3 = match("?", url, delim2+1)
            if delim3 then
              path = url[delim2+1..delim3-1]
              query_string = url[delim3+1..$]
            else
              path = url[delim2+1..$]
            end if
          else
            host_name = url[p..$]
          end if
        elsif equal(protocol, "file") then
          -- The length test keeps the three-character slice inside url.
          if delim2 and delim2+2 <= length(url) and equal(url[delim2..delim2+2], "///") then
            -- "file://localhost///remotehost/..."
            delim3 = match("/", url, delim2+3)
            if delim3 then
              host_name = url[delim2+3..delim3-1]
              path = url[delim3+1..$]
            else
              host_name = url[delim2+3..$]
            end if
          elsif delim2 then
            -- "file://host/path": only the text before the '/' is the host.
            {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
            path = url[delim2+1..$]
          else
            {host_name, port, user_name, password } = parseAddress(url[p..$])
          end if
        else
          if delim2 then
            {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
            delim3 = match("?", url, delim2+1)
            if delim3 then
              path = url[delim2+1..delim3-1]
              query_string = url[delim3+1..$]
            else
              path = url[delim2+1..$]
            end if
          else
            {host_name, port, user_name, password } = parseAddress(url[p..$])
          end if
        end if
      case 3 then
        -- "file:///path": local file, the path keeps its leading '/'.
        host_name = "localhost"
        path = url[p-1..$]
      case 4, 5 then
        -- "file:////host/path" and "file://///host/path"
        delim2 = match("/", url, p)
        if delim2 then
          host_name = url[p..delim2-1]
          path = url[delim2+1..$]
        else
          host_name = url[p..$]
        end if
      case else
        printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
        return -1
    end switch
  else
    delim1 = match(":", url)
    if delim1 then
      s = url[1..delim1-1]
      if equal(s, "mailto") then
        protocol = "mailto"
        rest = url[delim1+1..$]
        -- Tolerate an address without '@' instead of failing.
        delim2 = find('@', rest)
        if delim2 then
          user_name = rest[1..delim2-1]
          host_name = rest[delim2+1..$]
        else
          user_name = rest
        end if
      elsif equal(s, "news") then
        protocol = "news"
        delim2 = match("/", url, delim1+1)
        if delim2 then
          host_name = url[delim1+1..delim2-1]
        else
          host_name = url[delim1+1..$]
        end if
      else
        -- No scheme: the ':' is taken to separate host and port.
        protocol = "http"
        delim2 = match("/", url, delim1+1)
        if delim2 then
          {host_name, port, user_name, password } = parseAddress(url[1..delim2-1])
          delim3 = match("?", url, delim2+1)
          if delim3 then
            path = url[delim2+1..delim3-1]
            query_string = url[delim3+1..$]
          else
            path = url[delim2+1..$]
          end if
        else
          {host_name, port, user_name, password } = parseAddress(url[1..$])
        end if
      end if
    end if
  end if
  return { protocol, host_name, port, path, user_name, password, query_string }
end function
 
-- Test driver: parse every sample URL and print the resulting fields.
-- parse() returns the atom -1 for URLs it rejects, so the result is held in
-- an object and checked before it is indexed.
object parsed

puts(1, "\n#######################################################################\n")
for i = 1 to length(URLS) do
  parsed = parse(URLS[i])
  puts(1, "\n"&URLS[i]&"\n")
  if atom(parsed) then
    printf(1, "  Could not be parsed (error code %d)\n", parsed)
  else
    puts(1, "  Protocol .. : "&parsed[1]&"\n")
    puts(1, "  Host name . : "&parsed[2]&"\n")
    printf(1, "  Port ...... : %d\n", parsed[3])
    puts(1, "  Path ...... : "&parsed[4]&"\n")
    puts(1, "  User name . : "&parsed[5]&"\n")
    puts(1, "  Password .. :"&parsed[6]&"\n")
    puts(1, "  Query string: "&parsed[7]&"\n")
  end if
end for
 

Regards

Jean-Marc

Details

Type: Bug Report Severity: Normal Category: Library Routine
Assigned To: unknown Status: New Reported Release: v4.1.0 development
Fixed in SVN #: View VCS: none Milestone:

1. Comment by jmduro Dec 18, 2012

A small change to make the replacement fully compatible with the original function, which uses objects for the elements of the returned sequence (an absent field is 0) rather than only sequences as in my previous version:

-- Split the authority part of a URL ("user:password@host:port") into its
-- components.
--
-- Parameters:
--   url : the authority text, without scheme and without path.
--
-- Returns:
--   { host_name, port, user_name, password }.  user_name and password are 0
--   when absent; port is 0 when absent or not a valid integer.
function parseAddress(sequence url)
  object host_name = 0, user_name = 0, password = 0
  integer port = 0, at_pos = 0, colon = 0, search_from = 1
  sequence host_part
  object port_value

  at_pos = match("@", url)
  if at_pos then
    -- Credentials: split on the first ':' only, and tolerate a missing
    -- password ("user@host") instead of failing on the destructuring.
    colon = find(':', url[1..at_pos-1])
    if colon then
      user_name = url[1..colon-1]
      password = url[colon+1..at_pos-1]
    else
      user_name = url[1..at_pos-1]
    end if
  end if

  -- Everything after the '@' (or the whole text when there is none).
  host_part = url[at_pos+1..$]

  -- A bracketed IPv6 literal contains ':' itself; only look for the port
  -- separator after the closing bracket.
  if length(host_part) and host_part[1] = '[' then
    search_from = find(']', host_part)
    if search_from = 0 then
      search_from = 1
    end if
  end if

  colon = find(':', host_part, search_from)
  if colon then  -- port
    host_name = host_part[1..colon-1]
    port_value = to_number(host_part[colon+1..$])
    -- Keep the default 0 rather than tripping the integer type check.
    if integer(port_value) then
      port = port_value
    end if
  else
    host_name = host_part
  end if
  return { host_name, port, user_name, password }
end function
 
-- Split a URL into its components.
--
-- Parameters:
--   url              : the URL text.
--   querystring_also : when non-zero and a query string is present, the
--                      query string is passed through parse_querystring()
--                      before being returned.
--
-- Returns:
--   { protocol, host_name, port, path, user_name, password, query_string }
--   or -1 (after printing a message) when the number of slashes following
--   the scheme is not acceptable.  Fields that are absent from the URL are
--   returned as 0.  The path keeps its leading '/'.
--
-- Notes:
--   * "scheme://" URLs are dispatched on the number of slashes after the ':'.
--   * Without "://", "mailto:" and "news:" are recognised; any other text
--     containing ':' is treated as an http "host:port[/path[?query]]".
--   * Text with no ':' at all yields an empty protocol and 0 for all fields.
public function parse(sequence url, integer querystring_also=0)
  sequence protocol = "", s, rest
  object host_name = 0, path = 0, user_name = 0, password = 0, query_string = 0
  integer port = 0, nbSlashes = 0, delim1 = 0, delim2 = 0, delim3 = 0, p = 0

  delim1 = match("://", url)
  if delim1 then
    protocol = url[1..delim1-1]
    p = delim1+1
    -- Count the slashes; there might be up to 5, as in
    -- "file://///remotehost/share/dir/file.txt"
    -- (http://en.wikipedia.org/wiki/File_URI_scheme).
    -- The bounds test stops the scan on input such as "http://".
    while p <= length(url) and url[p] = '/' do
      nbSlashes += 1
      p += 1
    end while
    if (nbSlashes > 2) and not equal(protocol, "file") then
      printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
      return -1
    end if

    switch nbSlashes do
      case 2 then
        delim2 = match("/", url, p)
        if equal(protocol, "ldap") then
          -- The host is kept verbatim (it may be a bracketed IPv6 literal).
          if delim2 then
            host_name = url[p..delim2-1]
            delim3 = match("?", url, delim2+1)
            if delim3 then
              path = url[delim2..delim3-1]
              query_string = url[delim3+1..$]
            else
              path = url[delim2..$]
            end if
          else
            host_name = url[p..$]
          end if
        elsif equal(protocol, "file") then
          -- The length test keeps the three-character slice inside url.
          if delim2 and delim2+2 <= length(url) and equal(url[delim2..delim2+2], "///") then
            -- "file://localhost///remotehost/..."
            delim3 = match("/", url, delim2+3)
            if delim3 then
              host_name = url[delim2+3..delim3-1]
              path = url[delim3..$]
            else
              host_name = url[delim2+3..$]
            end if
          elsif delim2 then
            -- "file://host/path": only the text before the '/' is the host.
            {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
            path = url[delim2..$]
          else
            {host_name, port, user_name, password } = parseAddress(url[p..$])
          end if
        else
          if delim2 then
            {host_name, port, user_name, password } = parseAddress(url[p..delim2-1])
            delim3 = match("?", url, delim2+1)
            if delim3 then
              path = url[delim2..delim3-1]
              query_string = url[delim3+1..$]
            else
              path = url[delim2..$]
            end if
          else
            {host_name, port, user_name, password } = parseAddress(url[p..$])
          end if
        end if
      case 3 then
        -- "file:///path": local file.
        host_name = "localhost"
        path = url[p-1..$]
      case 4, 5 then
        -- "file:////host/path" and "file://///host/path"
        delim2 = match("/", url, p)
        if delim2 then
          host_name = url[p..delim2-1]
          path = url[delim2..$]
        else
          host_name = url[p..$]
        end if
      case else
        printf(1, "Invalid slash number (%d) with protocol %s\n", {nbSlashes, protocol})
        return -1
    end switch
  else
    delim1 = match(":", url)
    if delim1 then
      s = url[1..delim1-1]
      if equal(s, "mailto") then
        protocol = "mailto"
        rest = url[delim1+1..$]
        -- Tolerate an address without '@' instead of failing.
        delim2 = find('@', rest)
        if delim2 then
          user_name = rest[1..delim2-1]
          host_name = rest[delim2+1..$]
        else
          user_name = rest
        end if
      elsif equal(s, "news") then
        protocol = "news"
        delim2 = match("/", url, delim1+1)
        if delim2 then
          host_name = url[delim1+1..delim2-1]
        else
          host_name = url[delim1+1..$]
        end if
      else
        -- No scheme: the ':' is taken to separate host and port.
        protocol = "http"
        delim2 = match("/", url, delim1+1)
        if delim2 then
          {host_name, port, user_name, password } = parseAddress(url[1..delim2-1])
          delim3 = match("?", url, delim2+1)
          if delim3 then
            path = url[delim2..delim3-1]
            query_string = url[delim3+1..$]
          else
            path = url[delim2..$]
          end if
        else
          {host_name, port, user_name, password } = parseAddress(url[1..$])
        end if
      end if
    end if
  end if
  -- query_string is still the atom 0 when the URL had no '?'; only a real,
  -- non-empty string is handed to parse_querystring().
  if querystring_also and sequence(query_string) and length(query_string) then
    query_string = parse_querystring(query_string)
  end if
  return { protocol, host_name, port, path, user_name, password, query_string }
end function

Two include lines have to be added:

include std/sequence.e 
include std/convert.e 

Search



Quick Links

User menu

Not signed in.

Misc Menu