Pastey Rosetta 'natual' sorting challenge

-- Rosetta 'natural' sorting challenge
-- OE variant of Phix solution 
-- works without encoding transformations 
 
include std/sort.e 
include std/sequence.e 
include std/text.e 
include std/search.e 
include std/console.e 
include std/graphics.e 
 
 
function _nat_head( sequence s ) 
    return trim_head( s ) 
    end function 
    integer nat_head = routine_id( "_nat_head" ) 
    -- note that `trim` function cleans head and tail    
        
function _nat_adjacent( sequence s ) 
    while match( "  ", s ) do 
        s = match_replace( "  ", s, " " ) 
    end while 
    return s 
    end function 
    integer nat_adjacent = routine_id( "_nat_adjacent" ) 
     
function _nat_space( sequence s ) 
    s = transmute(s, 
        { {}, " ", "\r", "\x0c", "\x0b", "\n", "\t" }, 
        { {}, " ",  " ",    " ",    " ",  " ",  " " } ) 
       -- { {}, " ",  "\\\\r",    "\\\x0c",    "\\\\x0b",  "\\\\n",  "\\\\t" } ) 
    return s 
    end function 
    integer nat_space = routine_id( "_nat_space" ) 
     
function _nat_nocase( sequence s ) 
    return lower( s ) 
    end function 
    integer nat_nocase = routine_id( "_nat_nocase" ) 
     
function _nat_tonumber( sequence s ) 
    sequence res = "" 
    atom prev = ' ' 
    for i=1 to length(s) do 
        if find(s[i],"0123456789") then 
            if length(res)=0 or prev!='0' then 
                res &= s[i] - '0' 
            else 
                res[$] = res[$]*10 + ( s[i] -'0' ) 
            end if 
            prev = '0' 
        else 
            res &= s[i] 
        end if 
    end for 
    return res 
    end function 
    integer nat_tonumber = routine_id( "_nat_tonumber" ) 
    
function _nat_title( sequence s ) 
    sequence title = split(s) 
    if length(title) = 1 then  
        return s  
    elsif  equal( lower(title[1]), "the") then 
        return join( title[2..$] ) 
    else 
        return s 
    end if 
    end function 
    integer nat_title = routine_id( "_nat_title" ) 
 
      
function _nat_noaccent( sequence s )             
    sequence accents = { 
            {"?",'A'},{"?",'A'},{"?",'A'},{"?",'A'}, 
            {"?",'A'},{"?",'A'},{"?",'a'},{"?",'a'}, 
            {"?",'a'},{"?",'a'},{"?",'a'},{"?",'a'}, 
            {"?",'C'},{"?",'c'},{"?",'E'},{"?",'E'}, 
            {"?",'E'},{"?",'E'},{"?",'e'},{"?",'e'}, 
            {"?",'e'},{"?",'e'},{"?",'I'},{"?",'I'}, 
            {"?",'I'},{"?",'I'},{"?",'i'},{"?",'i'}, 
            {"?",'i'},{"?",'i'},{"?",'O'},{"?",'O'}, 
            {"?",'O'},{"?",'O'},{"?",'O'},{"?",'O'}, 
            {"?",'o'},{"?",'o'},{"?",'o'},{"?",'o'}, 
            {"?",'o'},{"?",'o'},{"?",'N'},{"?",'n'}, 
            {"?",'U'},{"?",'U'},{"?",'U'},{"?",'U'}, 
            {"?",'u'},{"?",'u'},{"?",'u'},{"?",'u'}, 
            {"?",'Y'},{"?",'y'},{"?",'y'} 
            } 
    sequence left,right 
    {left,right} = columnize( accents ) 
    s = transmute(s, {{}} & left, {{}} & right ) 
    return s 
    end function 
    integer nat_noaccent = routine_id( "_nat_noaccent" ) 
         
function _nat_ligature( sequence s ) 
    sequence ligatures = { 
            {"?","AE"},{"?","ae"},{"?","ss"}, 
            {"?","TH"},{"?","th"},{"?","TH"},{"?","th"}, 
            {"ffl", "ffl"}, {"ffi", "ffi"}, {"fi", "fi"}, {"ff", "ff"}, {"fl", "fl"}, 
            {"ſ", "s"}, {"ʒ", "z"}, {"st", "st"}  
                         } 
    sequence left,right 
    {left,right} = columnize( ligatures ) 
    s = transmute(s, {{}} & left, {{}} & right ) 
    return s 
    end function 
    integer nat_ligature = routine_id( "_nat_ligature" ) 
    
sequence tests = { 
                  { "# Ignoring leading spaces",        -- description 
                    { nat_head },                       -- naturalization function     
                    { "ignore leading spaces: 2-2",     -- test data 
                      " ignore leading spaces: 2-1",  
                      "  ignore leading spaces: 2+0",  
                      "   ignore leading spaces: 2+1"} }, 
 
                  {  "Ignore leading spaces",         
                     { nat_head } ,                   
                         {"    leading spaces: 4",    
                          "   leading spaces: 3", 
                          "  leading spaces: 2", 
                          " leading spaces: 1" } }, 
           
                  { "# Ignoring multiple adjacent spaces (m.a.s)", 
                    { nat_adjacent }, 
                    { "ignore m.a.s spaces: 2-2",  
                      "ignore m.a.s  spaces: 2-1",  
                      "ignore m.a.s   spaces: 2+0",  
                      "ignore m.a.s    spaces: 2+1" } }, 
           
                  { "Ignore multiple adjacent spaces",  
                    { nat_adjacent },  
                            {"adjacent   spaces: 3", 
                           "adjacent    spaces: 4", 
                           "adjacent spaces: 1", 
                           "adjacent  spaces: 2"} },                   
                    { "# Equivalent whitespace characters", 
                      { nat_space }, 
                      { "Equiv. spaces: 3-3",  
                        "Equiv.\rspaces: 3-2",  
                        "Equiv.\x0cspaces: 3-1",  
                        "Equiv.\x0bspaces: 3+0",  
                        "Equiv.\nspaces: 3+1",  
                        "Equiv.\tspaces: 3+2"} }, 
                   
                   
                  { "Equivalent whitespace characters",  
                     { nat_space },  
                             {"white space: 3-2", 
                           "white\rspace: 3-3", 
                           "white\x0cspace: 3-1", 
                           "white\x0bspace: 3+0", 
                           "white\nspace: 3+1", 
                           "white\tspace: 3+2"}},                   
           
           
                  { "# Case Indepenent sort", 
                    { nat_nocase }, 
                    { "cASE INDEPENENT: 3-2",  
                      "caSE INDEPENENT: 3-1",  
                      "casE INDEPENENT: 3+0",  
                      "case INDEPENENT: 3+1" } }, 
 
                     
                  { "Case independent",  
                    { nat_nocase},  
                          {"caSE independent: 3-1", 
                           "cASE independent: 3-2", 
                           "casE independent: 3+0", 
                           "case independent: 3+1"} }, 
 
                  { "# Numeric fields as numerics", 
                    {nat_tonumber}, 
                          { "foo100bar99baz0.txt",  
                            "foo100bar10baz0.txt",  
                            "foo1000bar99baz10.txt",  
                            "foo1000bar99baz9.txt" } }, 
 
                   
                  { "Numeric fields as numerics",  
                    {nat_tonumber},  
                         { "foo1000bar99baz9.txt", 
                           "foo100bar99baz0.txt", 
                           "foo100bar10baz0.txt", 
                           "foo1000bar99baz10.txt"} }, 
                         
                    
                  { "Numeric fields as numerics",  
                    {nat_tonumber}, 
                    {  "foo1bar", 
                        "foo100bar", 
                        "foo bar", 
                        "foo1000bar" } }, 
 
 
                  { "# Title sorts", 
                    {nat_title}, 
                    { "The Wind in the Willows",  
                      "The 40th step more",  
                      "The 39 steps",  
                      "Wanda" } }, 
 
                    
                  { "Ignore leading 'the'", 
                   {nat_title}, 
                           {"The Wind in the Willows", 
                           "The 40th step more", 
                           "The 39 steps", 
                           "Wanda" } }, 
           
                  { " Equivalent accented characters (and case) [literal]",  
                    {nat_noaccent, nat_nocase} ,  
                          {"ignore ? accents: 2-2", 
                           "ignore ? accents: 2-1", 
                           "ignore y accents: 2+0", 
                           "ignore Y accents: 2+1" } }, 
     
-- ?  
    -- codepoint u/FD 253 
    -- utf8 {195,189} 
 
-- ?  
    -- codepoint u/DD 221     
    -- utf8 {195,157} 
 
                  { " Equivalent accented characters (and case) [utf8]",  
                    {nat_noaccent, nat_nocase} ,  
                          {"ignore " & {195,189} & " accents: 2-2", 
                           "ignore " & {195, 157} & " accents: 2-1", 
                           "ignore y accents: 2+0", 
                           "ignore Y accents: 2+1" } }, 
 
 
                { "# Separated ligatures", 
                  { nat_ligature, nat_noaccent, nat_nocase }, 
                  { "\u0132 ligatured ij",  
                    "no ligature" } },  
 
           
                  { "Expand ligatures",  
                    { nat_ligature, nat_noaccent, nat_nocase },  
                            {"Ball","Card","above","aether", 
                           "apple","autumn","au?en","bald", 
                           "car","e-mail","evoke","nina", 
                            "ni?o","?on","?vian","?on"} }, 
     
   
                $} 
--------------------------------------- do sorting 
function clean_esc( sequence s ) 
    -- for nicer output 
    s = transmute(s, 
        { {}, " ", "\r", "\x0c", "\x0b", "\n", "\t" }, 
        { {}, " ",  "\\r",    "\\x0c",    "\\x0b",  "\\n",  "\\t" } ) 
    return s     
end function 
 
    sequence s,t 
    integer ix,iy 
    sequence T 
    sequence orig, default, natural 
     
for rosetta = 1 to length( tests ) do 
    t = tests[rosetta][3] -- unsorted example 
 
    T = sort( t )       -- default sorted  
 
    s = {}  
        for i=1 to length(t) do 
            s = append(s,  { transform( t[i],  tests[rosetta][2]  ), i  } )            
        end for 
        s = sort_columns(s, {1} )  -- nat sorted 
 
    printf(1, "\n   %s\n----------------------------------------------\n", {tests[rosetta][1]} ) 
    printf(1, "   %-25s  %-25s  %s\n\n", { "original", "default sort", "nat sort" } ) 
    for i=1 to length(s) do 
        orig = clean_esc( t[i] ) 
        default = clean_esc( T[i] ) 
        natural = clean_esc( t[s[i][2] ] )     
        {ix,iy} = get_position() puts(1,"\n") 
        position(ix,iy)         printf(1, "|%-s", { orig } )  
        position(ix, iy+25 )    printf(1, "|%-s", { default } )     
        position(ix, iy+50)     printf(1, "|%-s", { natural } ) 
        puts(1, "\n" ) 
    end for 
end for 
 
 
/* 
   # Ignoring leading spaces 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|ignore leading spaces: 2|   ignore leading spaces|  ignore leading spaces: 2+0 
| ignore leading spaces: |  ignore leading spaces:|   ignore leading spaces: 2+1 
|  ignore leading spaces:| ignore leading spaces: | ignore leading spaces: 2-1 
|   ignore leading spaces|ignore leading spaces: 2|ignore leading spaces: 2-2 
 
   Ignore leading spaces 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|    leading spaces: 4   |    leading spaces: 4   | leading spaces: 1 
|   leading spaces: 3    |   leading spaces: 3    |  leading spaces: 2 
|  leading spaces: 2     |  leading spaces: 2     |   leading spaces: 3 
| leading spaces: 1      | leading spaces: 1      |    leading spaces: 4 
 
   # Ignoring multiple adjacent spaces (m.a.s) 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|ignore m.a.s spaces: 2-2|ignore m.a.s    spaces: |ignore m.a.s   spaces: 2+0 
|ignore m.a.s  spaces: 2-|ignore m.a.s   spaces: 2|ignore m.a.s    spaces: 2+1 
|ignore m.a.s   spaces: 2|ignore m.a.s  spaces: 2-|ignore m.a.s  spaces: 2-1 
|ignore m.a.s    spaces: |ignore m.a.s spaces: 2-2|ignore m.a.s spaces: 2-2 
 
   Ignore multiple adjacent spaces 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|adjacent   spaces: 3    |adjacent    spaces: 4   |adjacent spaces: 1 
|adjacent    spaces: 4   |adjacent   spaces: 3    |adjacent  spaces: 2 
|adjacent spaces: 1      |adjacent  spaces: 2     |adjacent   spaces: 3 
|adjacent  spaces: 2     |adjacent spaces: 1      |adjacent    spaces: 4 
 
   # Equivalent whitespace characters 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|Equiv. spaces: 3-3      |Equiv.\tspaces: 3+2     |Equiv.\x0bspaces: 3+0 
|Equiv.\rspaces: 3-2     |Equiv.\nspaces: 3+1     |Equiv.\nspaces: 3+1 
|Equiv.\x0cspaces: 3-1   |Equiv.\x0bspaces: 3+0   |Equiv.\tspaces: 3+2 
|Equiv.\x0bspaces: 3+0   |Equiv.\x0cspaces: 3-1   |Equiv.\x0cspaces: 3-1 
|Equiv.\nspaces: 3+1     |Equiv.\rspaces: 3-2     |Equiv.\rspaces: 3-2 
|Equiv.\tspaces: 3+2     |Equiv. spaces: 3-3      |Equiv. spaces: 3-3 
 
   Equivalent whitespace characters 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|white space: 3-2        |white\tspace: 3+2       |white\x0bspace: 3+0 
|white\rspace: 3-3       |white\nspace: 3+1       |white\nspace: 3+1 
|white\x0cspace: 3-1     |white\x0bspace: 3+0     |white\tspace: 3+2 
|white\x0bspace: 3+0     |white\x0cspace: 3-1     |white\x0cspace: 3-1 
|white\nspace: 3+1       |white\rspace: 3-3       |white space: 3-2 
|white\tspace: 3+2       |white space: 3-2        |white\rspace: 3-3 
 
   # Case Indepenent sort 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|cASE INDEPENENT: 3-2    |cASE INDEPENENT: 3-2    |casE INDEPENENT: 3+0 
|caSE INDEPENENT: 3-1    |caSE INDEPENENT: 3-1    |case INDEPENENT: 3+1 
|casE INDEPENENT: 3+0    |casE INDEPENENT: 3+0    |caSE INDEPENENT: 3-1 
|case INDEPENENT: 3+1    |case INDEPENENT: 3+1    |cASE INDEPENENT: 3-2 
 
   Case independent 
---------------------------------------------- 
   original                   default sort               nat sort 
 
|caSE independent: 3-1   |cASE independent: 3-2   |casE i

1. Comment by _tom 1 month ago

sorry: 'natural'