### Pastey Rosetta 'natural' sorting challenge

```-- Rosetta 'natural' sorting challenge
-- OE variant of Phix solution
-- works without encoding transformations

include std/sort.e
include std/sequence.e
include std/text.e
include std/search.e
include std/console.e
include std/graphics.e

end function
-- note that `trim` function cleans head and tail

while match( "  ", s ) do
s = match_replace( "  ", s, " " )
end while
return s
end function

-- Whitespace normaliser: each carriage return, form feed, vertical tab,
-- newline or tab found in `s` is swapped for one plain space.
-- Returns the rewritten sequence.
function _nat_space( sequence s )
    -- The leading {} in both lists asks transmute() to match and replace
    -- sub-sequences instead of single elements.
    sequence blanks = { {}, " ", "\r", "\x0c", "\x0b", "\n", "\t" }
    sequence spaces = { {} } & repeat( " ", length( blanks ) - 1 )
    return transmute( s, blanks, spaces )
end function
integer nat_space = routine_id( "_nat_space" )

-- Case-folding step: hands back `s` converted to lower case, so spellings
-- that differ only in letter case produce the same sort key.
function _nat_nocase( sequence s )
    sequence folded = lower( s )
    return folded
end function
integer nat_nocase = routine_id( "_nat_nocase" )

-- Numeric-field step: builds a sort key in which every run of decimal
-- digits in `s` is collapsed into a single element holding the run's
-- numeric value; all other elements are copied through unchanged.
--   "foo100bar9"  ->  {'f','o','o',100,'b','a','r',9}
-- Returns the new sequence; `s` itself is not modified.
function _nat_tonumber( sequence s )
    sequence res = ""
    -- True only while the element handled on the previous iteration was a
    -- digit. It must be cleared on every non-digit, otherwise the first
    -- digit of a later run would be folded into the preceding character.
    integer in_number = 0
    for i = 1 to length( s ) do
        if find( s[i], "0123456789" ) then
            if in_number then
                -- continue the current run: shift and add the new digit
                res[$] = res[$] * 10 + ( s[i] - '0' )
            else
                -- start a new run with this digit's value
                res &= s[i] - '0'
            end if
            in_number = 1
        else
            res &= s[i]
            in_number = 0
        end if
    end for
    return res
end function
integer nat_tonumber = routine_id( "_nat_tonumber" )

-- Title step: when `s` consists of more than one space-separated word and
-- the first word is "the" (in any letter case), returns the remaining words
-- re-joined with single spaces. In every other case, including an empty
-- string or a lone "The", `s` is returned unchanged.
function _nat_title( sequence s )
    sequence words = split( s )
    -- The length test comes first so that words[1] is never indexed when
    -- split() produced no words; `and` short-circuits inside an if condition.
    if length( words ) > 1 and equal( lower( words[1] ), "the" ) then
        return join( words[2..$] )
    end if
    return s
end function
integer nat_title = routine_id( "_nat_title" )

-- Accent-stripping step: replaces each accented Latin letter listed below
-- with its unaccented base letter and returns the rewritten sequence.
--
-- Each key is a string literal and each replacement is a single character.
-- Assumes this source file is saved as UTF-8, so that every key is the
-- multi-byte sequence that appears in UTF-8 encoded input.
--
-- NOTE(review): the keys had all degraded to "?" (which made the routine
-- turn literal question marks into 'A' and strip nothing). They were
-- reconstructed from the base letters in the replacement column; confirm
-- against the upstream table.
function _nat_noaccent( sequence s )
    sequence accents = {
        {"À",'A'},{"Á",'A'},{"Â",'A'},{"Ã",'A'},
        {"Ä",'A'},{"Å",'A'},{"à",'a'},{"á",'a'},
        {"â",'a'},{"ã",'a'},{"ä",'a'},{"å",'a'},
        {"Ç",'C'},{"ç",'c'},{"È",'E'},{"É",'E'},
        {"Ê",'E'},{"Ë",'E'},{"è",'e'},{"é",'e'},
        {"ê",'e'},{"ë",'e'},{"Ì",'I'},{"Í",'I'},
        {"Î",'I'},{"Ï",'I'},{"ì",'i'},{"í",'i'},
        {"î",'i'},{"ï",'i'},{"Ò",'O'},{"Ó",'O'},
        {"Ô",'O'},{"Õ",'O'},{"Ö",'O'},{"Ø",'O'},
        {"ò",'o'},{"ó",'o'},{"ô",'o'},{"õ",'o'},
        {"ö",'o'},{"ø",'o'},{"Ñ",'N'},{"ñ",'n'},
        {"Ù",'U'},{"Ú",'U'},{"Û",'U'},{"Ü",'U'},
        {"ù",'u'},{"ú",'u'},{"û",'u'},{"ü",'u'},
        {"Ý",'Y'},{"ÿ",'y'},{"ý",'y'}
    }
    sequence left, right
    -- Split the pair table into a list of keys and a list of replacements.
    {left, right} = columnize( accents )
    -- The leading {} on both lists selects sub-sequence matching and
    -- sub-sequence replacement in transmute().
    s = transmute( s, {{}} & left, {{}} & right )
    return s
end function
integer nat_noaccent = routine_id( "_nat_noaccent" )

-- Ligature step: expands each ligature or special letter listed below into
-- its plain multi-letter spelling and returns the rewritten sequence.
--
-- Assumes this source file is saved as UTF-8, so that every key is the
-- multi-byte sequence that appears in UTF-8 encoded input.
--
-- NOTE(review): the first seven keys had all degraded to "?" (so literal
-- question marks were being turned into "AE"). They were reconstructed from
-- their replacements (AE, ae, ss, TH, th, TH, th); confirm against the
-- upstream table.
function _nat_ligature( sequence s )
    sequence ligatures = {
        {"Æ","AE"},{"æ","ae"},{"ß","ss"},
        {"Þ","TH"},{"þ","th"},{"Ð","TH"},{"ð","th"},
        {"ﬄ", "ffl"}, {"ﬃ", "ffi"}, {"ﬁ", "fi"}, {"ﬀ", "ff"}, {"ﬂ", "fl"},
        {"ſ", "s"}, {"ʒ", "z"}, {"ﬆ", "st"}
    }
    sequence left, right
    -- Split the pair table into a list of keys and a list of replacements.
    {left, right} = columnize( ligatures )
    -- The leading {} on both lists selects sub-sequence matching and
    -- sub-sequence replacement in transmute().
    s = transmute( s, {{}} & left, {{}} & right )
    return s
end function
integer nat_ligature = routine_id( "_nat_ligature" )

-- Test table consumed by the driver loop. Each entry is meant to be a
-- three-element sequence:
--   [1] description text printed as the heading,
--   [2] list of routine ids applied to every string to build its sort key,
--   [3] the strings to sort.
--
-- NOTE(review): this literal appears to have lost lines in transit; the
-- individual problems are flagged where they occur below.
sequence tests = {
-- NOTE(review): nat_head is not defined in the visible part of the file.
{ "# Ignoring leading spaces",        -- description
{ nat_head },                       -- naturalization function
{ "ignore leading spaces: 2-2",     -- test data
"   ignore leading spaces: 2+1"} },

-- NOTE(review): orphaned tail of an entry - the string below closes two
-- braces that are never opened; the start of this entry seems to be missing.
" leading spaces: 1" } },

-- NOTE(review): this entry has only a description and test data; the
-- routine-id list expected at position [2] is absent.
{ "# Ignoring multiple adjacent spaces (m.a.s)",
{ "ignore m.a.s spaces: 2-2",
"ignore m.a.s  spaces: 2-1",
"ignore m.a.s   spaces: 2+0",
"ignore m.a.s    spaces: 2+1" } },

{ "# Equivalent whitespace characters",
{ nat_space },
{ "Equiv. spaces: 3-3",
"Equiv.\rspaces: 3-2",
"Equiv.\x0cspaces: 3-1",
"Equiv.\x0bspaces: 3+0",
"Equiv.\nspaces: 3+1",
"Equiv.\tspaces: 3+2"} },

{ "Equivalent whitespace characters",
{ nat_space },
{"white space: 3-2",
"white\rspace: 3-3",
"white\x0cspace: 3-1",
"white\x0bspace: 3+0",
"white\nspace: 3+1",
"white\tspace: 3+2"}},

{ "# Case Indepenent sort",
{ nat_nocase },
{ "cASE INDEPENENT: 3-2",
"caSE INDEPENENT: 3-1",
"casE INDEPENENT: 3+0",
"case INDEPENENT: 3+1" } },

{ "Case independent",
{ nat_nocase},
{"caSE independent: 3-1",
"cASE independent: 3-2",
"casE independent: 3+0",
"case independent: 3+1"} },

{ "# Numeric fields as numerics",
{nat_tonumber},
{ "foo100bar99baz0.txt",
"foo100bar10baz0.txt",
"foo1000bar99baz10.txt",
"foo1000bar99baz9.txt" } },

{ "Numeric fields as numerics",
{nat_tonumber},
{ "foo1000bar99baz9.txt",
"foo100bar99baz0.txt",
"foo100bar10baz0.txt",
"foo1000bar99baz10.txt"} },

{ "Numeric fields as numerics",
{nat_tonumber},
{  "foo1bar",
"foo100bar",
"foo bar",
"foo1000bar" } },

{ "# Title sorts",
{nat_title},
{ "The Wind in the Willows",
"The 40th step more",
"The 39 steps",
"Wanda" } },

-- NOTE(review): the opening brace and description string of this entry
-- are missing; as written the routine-id list starts a new element.
{nat_title},
{"The Wind in the Willows",
"The 40th step more",
"The 39 steps",
"Wanda" } },

-- NOTE(review): the "?" in the first two strings looks like a non-ASCII
-- letter lost to an encoding problem - presumably the accented y / Y that
-- the [utf8] variant below spells out as bytes; verify.
{ " Equivalent accented characters (and case) [literal]",
{nat_noaccent, nat_nocase} ,
{"ignore ? accents: 2-2",
"ignore ? accents: 2-1",
"ignore y accents: 2+0",
"ignore Y accents: 2+1" } },

-- lower-case y with acute accent
-- codepoint u/FD 253
-- utf8 {195,189}

-- upper-case Y with acute accent
-- codepoint u/DD 221
-- utf8 {195,157}

-- Same data as the [literal] entry, with the accented letters written out
-- as their UTF-8 byte pairs so they do not depend on the file's encoding.
{ " Equivalent accented characters (and case) [utf8]",
{nat_noaccent, nat_nocase} ,
{"ignore " & {195,189} & " accents: 2-2",
"ignore " & {195, 157} & " accents: 2-1",
"ignore y accents: 2+0",
"ignore Y accents: 2+1" } },

{ "# Separated ligatures",
{ nat_ligature, nat_noaccent, nat_nocase },
{ "\u0132 ligatured ij",
"no ligature" } },

-- NOTE(review): the "?" characters in "au?en", "ni?o", "?on", "?vian" and
-- "?on" look like non-ASCII letters lost to an encoding problem; verify
-- against the original data.
{ "Expand ligatures",
{ nat_ligature, nat_noaccent, nat_nocase },
{"Ball","Card","above","aether",
"apple","autumn","au?en","bald",
"car","e-mail","evoke","nina",
"ni?o","?on","?vian","?on"} },

-- NOTE(review): the backslash before the end-of-list marker looks like a
-- paste-escaping artifact - presumably this should read `$}`; confirm.
\$}
--------------------------------------- do sorting
-- Display helper: swaps each control character in `s` (carriage return,
-- form feed, vertical tab, newline, tab) for its backslash spelling so the
-- string prints on a single line. Returns the printable copy.
function clean_esc( sequence s )
    -- The leading {} in both lists selects sub-sequence mode in transmute().
    sequence raw     = { {}, " ", "\r", "\x0c", "\x0b", "\n", "\t" }
    sequence visible = { {}, " ",  "\\r",    "\\x0c",    "\\x0b",  "\\n",  "\\t" }
    return transmute( s, raw, visible )
end function

-- Driver: for every entry in `tests`, prints the strings in their original
-- order, in plain sort() order and in "natural" order, side by side in
-- three columns placed 25 character cells apart.
sequence current, unsorted, plain, keyed
sequence cursor, cells
integer row, col

for rosetta = 1 to length( tests ) do
    current  = tests[rosetta]
    unsorted = current[3]            -- strings as listed in the table
    plain    = sort( unsorted )      -- default ordering

    -- Pair every transformed key with the index of the string it came
    -- from, then order the pairs by key to obtain the natural ordering.
    keyed = repeat( 0, length( unsorted ) )
    for i = 1 to length( unsorted ) do
        keyed[i] = { transform( unsorted[i], current[2] ), i }
    end for
    keyed = sort_columns( keyed, {1} )

    printf(1, "\n   %s\n----------------------------------------------\n", { current[1] } )
    printf(1, "   %-25s  %-25s  %s\n\n", { "original", "default sort", "nat sort" } )

    for i = 1 to length( keyed ) do
        cells = {
            clean_esc( unsorted[i] ),
            clean_esc( plain[i] ),
            clean_esc( unsorted[ keyed[i][2] ] )
        }

        -- Remember where the cursor is before emitting the newline, then
        -- write each column at a fixed offset from that spot.
        cursor = get_position()
        row = cursor[1]
        col = cursor[2]
        puts( 1, "\n" )
        for k = 1 to length( cells ) do
            position( row, col + 25 * ( k - 1 ) )
            printf( 1, "|%-s", { cells[k] } )
        end for
        puts( 1, "\n" )
    end for
end for

/*
----------------------------------------------
original                   default sort               nat sort

----------------------------------------------
original                   default sort               nat sort

# Ignoring multiple adjacent spaces (m.a.s)
----------------------------------------------
original                   default sort               nat sort

|ignore m.a.s spaces: 2-2|ignore m.a.s    spaces: |ignore m.a.s   spaces: 2+0
|ignore m.a.s  spaces: 2-|ignore m.a.s   spaces: 2|ignore m.a.s    spaces: 2+1
|ignore m.a.s   spaces: 2|ignore m.a.s  spaces: 2-|ignore m.a.s  spaces: 2-1
|ignore m.a.s    spaces: |ignore m.a.s spaces: 2-2|ignore m.a.s spaces: 2-2

----------------------------------------------
original                   default sort               nat sort

# Equivalent whitespace characters
----------------------------------------------
original                   default sort               nat sort

|Equiv. spaces: 3-3      |Equiv.\tspaces: 3+2     |Equiv.\x0bspaces: 3+0
|Equiv.\rspaces: 3-2     |Equiv.\nspaces: 3+1     |Equiv.\nspaces: 3+1
|Equiv.\x0cspaces: 3-1   |Equiv.\x0bspaces: 3+0   |Equiv.\tspaces: 3+2
|Equiv.\x0bspaces: 3+0   |Equiv.\x0cspaces: 3-1   |Equiv.\x0cspaces: 3-1
|Equiv.\nspaces: 3+1     |Equiv.\rspaces: 3-2     |Equiv.\rspaces: 3-2
|Equiv.\tspaces: 3+2     |Equiv. spaces: 3-3      |Equiv. spaces: 3-3

Equivalent whitespace characters
----------------------------------------------
original                   default sort               nat sort

|white space: 3-2        |white\tspace: 3+2       |white\x0bspace: 3+0
|white\rspace: 3-3       |white\nspace: 3+1       |white\nspace: 3+1
|white\x0cspace: 3-1     |white\x0bspace: 3+0     |white\tspace: 3+2
|white\x0bspace: 3+0     |white\x0cspace: 3-1     |white\x0cspace: 3-1
|white\nspace: 3+1       |white\rspace: 3-3       |white space: 3-2
|white\tspace: 3+2       |white space: 3-2        |white\rspace: 3-3

# Case Indepenent sort
----------------------------------------------
original                   default sort               nat sort

|cASE INDEPENENT: 3-2    |cASE INDEPENENT: 3-2    |casE INDEPENENT: 3+0
|caSE INDEPENENT: 3-1    |caSE INDEPENENT: 3-1    |case INDEPENENT: 3+1
|casE INDEPENENT: 3+0    |casE INDEPENENT: 3+0    |caSE INDEPENENT: 3-1
|case INDEPENENT: 3+1    |case INDEPENENT: 3+1    |cASE INDEPENENT: 3-2

Case independent
----------------------------------------------
original                   default sort               nat sort

|caSE independent: 3-1   |cASE independent: 3-2   |casE i```

sorry: 'natural'