Pastey Rosetta 'natual' sorting challenge
- Posted by _tom (admin) Oct 19, 2017
-- Rosetta 'natural' sorting challenge -- OE variant of Phix solution -- works without encoding transformations include std/sort.e include std/sequence.e include std/text.e include std/search.e include std/console.e include std/graphics.e function _nat_head( sequence s ) return trim_head( s ) end function integer nat_head = routine_id( "_nat_head" ) -- note that `trim` function cleans head and tail function _nat_adjacent( sequence s ) while match( " ", s ) do s = match_replace( " ", s, " " ) end while return s end function integer nat_adjacent = routine_id( "_nat_adjacent" ) function _nat_space( sequence s ) s = transmute(s, { {}, " ", "\r", "\x0c", "\x0b", "\n", "\t" }, { {}, " ", " ", " ", " ", " ", " " } ) -- { {}, " ", "\\\\r", "\\\x0c", "\\\\x0b", "\\\\n", "\\\\t" } ) return s end function integer nat_space = routine_id( "_nat_space" ) function _nat_nocase( sequence s ) return lower( s ) end function integer nat_nocase = routine_id( "_nat_nocase" ) function _nat_tonumber( sequence s ) sequence res = "" atom prev = ' ' for i=1 to length(s) do if find(s[i],"0123456789") then if length(res)=0 or prev!='0' then res &= s[i] - '0' else res[$] = res[$]*10 + ( s[i] -'0' ) end if prev = '0' else res &= s[i] end if end for return res end function integer nat_tonumber = routine_id( "_nat_tonumber" ) function _nat_title( sequence s ) sequence title = split(s) if length(title) = 1 then return s elsif equal( lower(title[1]), "the") then return join( title[2..$] ) else return s end if end function integer nat_title = routine_id( "_nat_title" ) function _nat_noaccent( sequence s ) sequence accents = { {"?",'A'},{"?",'A'},{"?",'A'},{"?",'A'}, {"?",'A'},{"?",'A'},{"?",'a'},{"?",'a'}, {"?",'a'},{"?",'a'},{"?",'a'},{"?",'a'}, {"?",'C'},{"?",'c'},{"?",'E'},{"?",'E'}, {"?",'E'},{"?",'E'},{"?",'e'},{"?",'e'}, {"?",'e'},{"?",'e'},{"?",'I'},{"?",'I'}, {"?",'I'},{"?",'I'},{"?",'i'},{"?",'i'}, {"?",'i'},{"?",'i'},{"?",'O'},{"?",'O'}, {"?",'O'},{"?",'O'},{"?",'O'},{"?",'O'}, {"?",'o'},{"?",'o'},{"?",'o'},{"?",'o'}, {"?",'o'},{"?",'o'},{"?",'N'},{"?",'n'}, {"?",'U'},{"?",'U'},{"?",'U'},{"?",'U'}, {"?",'u'},{"?",'u'},{"?",'u'},{"?",'u'}, {"?",'Y'},{"?",'y'},{"?",'y'} } sequence left,right {left,right} = columnize( accents ) s = transmute(s, {{}} & left, {{}} & right ) return s end function integer nat_noaccent = routine_id( "_nat_noaccent" ) function _nat_ligature( sequence s ) sequence ligatures = { {"?","AE"},{"?","ae"},{"?","ss"}, {"?","TH"},{"?","th"},{"?","TH"},{"?","th"}, {"ffl", "ffl"}, {"ffi", "ffi"}, {"fi", "fi"}, {"ff", "ff"}, {"fl", "fl"}, {"ſ", "s"}, {"ʒ", "z"}, {"st", "st"} } sequence left,right {left,right} = columnize( ligatures ) s = transmute(s, {{}} & left, {{}} & right ) return s end function integer nat_ligature = routine_id( "_nat_ligature" ) sequence tests = { { "# Ignoring leading spaces", -- description { nat_head }, -- naturalization function { "ignore leading spaces: 2-2", -- test data " ignore leading spaces: 2-1", " ignore leading spaces: 2+0", " ignore leading spaces: 2+1"} }, { "Ignore leading spaces", { nat_head } , {" leading spaces: 4", " leading spaces: 3", " leading spaces: 2", " leading spaces: 1" } }, { "# Ignoring multiple adjacent spaces (m.a.s)", { nat_adjacent }, { "ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1", "ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1" } }, { "Ignore multiple adjacent spaces", { nat_adjacent }, {"adjacent spaces: 3", "adjacent spaces: 4", "adjacent spaces: 1", "adjacent spaces: 2"} }, { "# Equivalent whitespace characters", { nat_space }, { "Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"} }, { "Equivalent whitespace characters", { nat_space }, {"white space: 3-2", "white\rspace: 3-3", "white\x0cspace: 3-1", "white\x0bspace: 3+0", "white\nspace: 3+1", "white\tspace: 3+2"}}, { "# Case Indepenent sort", { nat_nocase }, { "cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1" } }, { "Case independent", { nat_nocase}, {"caSE independent: 3-1", "cASE independent: 3-2", "casE independent: 3+0", "case independent: 3+1"} }, { "# Numeric fields as numerics", {nat_tonumber}, { "foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt" } }, { "Numeric fields as numerics", {nat_tonumber}, { "foo1000bar99baz9.txt", "foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt"} }, { "Numeric fields as numerics", {nat_tonumber}, { "foo1bar", "foo100bar", "foo bar", "foo1000bar" } }, { "# Title sorts", {nat_title}, { "The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda" } }, { "Ignore leading 'the'", {nat_title}, {"The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda" } }, { " Equivalent accented characters (and case) [literal]", {nat_noaccent, nat_nocase} , {"ignore ? accents: 2-2", "ignore ? accents: 2-1", "ignore y accents: 2+0", "ignore Y accents: 2+1" } }, -- ? -- codepoint u/FD 253 -- utf8 {195,189} -- ? -- codepoint u/DD 221 -- utf8 {195,157} { " Equivalent accented characters (and case) [utf8]", {nat_noaccent, nat_nocase} , {"ignore " & {195,189} & " accents: 2-2", "ignore " & {195, 157} & " accents: 2-1", "ignore y accents: 2+0", "ignore Y accents: 2+1" } }, { "# Separated ligatures", { nat_ligature, nat_noaccent, nat_nocase }, { "\u0132 ligatured ij", "no ligature" } }, { "Expand ligatures", { nat_ligature, nat_noaccent, nat_nocase }, {"Ball","Card","above","aether", "apple","autumn","au?en","bald", "car","e-mail","evoke","nina", "ni?o","?on","?vian","?on"} }, $} --------------------------------------- do sorting function clean_esc( sequence s ) -- for nicer output s = transmute(s, { {}, " ", "\r", "\x0c", "\x0b", "\n", "\t" }, { {}, " ", "\\r", "\\x0c", "\\x0b", "\\n", "\\t" } ) return s end function sequence s,t integer ix,iy sequence T sequence orig, default, natural for rosetta = 1 to length( tests ) do t = tests[rosetta][3] -- unsorted example T = sort( t ) -- default sorted s = {} for i=1 to length(t) do s = append(s, { transform( t[i], tests[rosetta][2] ), i } ) end for s = sort_columns(s, {1} ) -- nat sorted printf(1, "\n %s\n----------------------------------------------\n", {tests[rosetta][1]} ) printf(1, " %-25s %-25s %s\n\n", { "original", "default sort", "nat sort" } ) for i=1 to length(s) do orig = clean_esc( t[i] ) default = clean_esc( T[i] ) natural = clean_esc( t[s[i][2] ] ) {ix,iy} = get_position() puts(1,"\n") position(ix,iy) printf(1, "|%-s", { orig } ) position(ix, iy+25 ) printf(1, "|%-s", { default } ) position(ix, iy+50) printf(1, "|%-s", { natural } ) puts(1, "\n" ) end for end for /* # Ignoring leading spaces ---------------------------------------------- original default sort nat sort |ignore leading spaces: 2| ignore leading spaces| ignore leading spaces: 2+0 | ignore leading spaces: | ignore leading spaces:| ignore leading spaces: 2+1 | ignore leading spaces:| ignore leading spaces: | ignore leading spaces: 2-1 | ignore leading spaces|ignore leading spaces: 2|ignore leading spaces: 2-2 Ignore leading spaces ---------------------------------------------- original default sort nat sort | leading spaces: 4 | leading spaces: 4 | leading spaces: 1 | leading spaces: 3 | leading spaces: 3 | leading spaces: 2 | leading spaces: 2 | leading spaces: 2 | leading spaces: 3 | leading spaces: 1 | leading spaces: 1 | leading spaces: 4 # Ignoring multiple adjacent spaces (m.a.s) ---------------------------------------------- original default sort nat sort |ignore m.a.s spaces: 2-2|ignore m.a.s spaces: |ignore m.a.s spaces: 2+0 |ignore m.a.s spaces: 2-|ignore m.a.s spaces: 2|ignore m.a.s spaces: 2+1 |ignore m.a.s spaces: 2|ignore m.a.s spaces: 2-|ignore m.a.s spaces: 2-1 |ignore m.a.s spaces: |ignore m.a.s spaces: 2-2|ignore m.a.s spaces: 2-2 Ignore multiple adjacent spaces ---------------------------------------------- original default sort nat sort |adjacent spaces: 3 |adjacent spaces: 4 |adjacent spaces: 1 |adjacent spaces: 4 |adjacent spaces: 3 |adjacent spaces: 2 |adjacent spaces: 1 |adjacent spaces: 2 |adjacent spaces: 3 |adjacent spaces: 2 |adjacent spaces: 1 |adjacent spaces: 4 # Equivalent whitespace characters ---------------------------------------------- original default sort nat sort |Equiv. spaces: 3-3 |Equiv.\tspaces: 3+2 |Equiv.\x0bspaces: 3+0 |Equiv.\rspaces: 3-2 |Equiv.\nspaces: 3+1 |Equiv.\nspaces: 3+1 |Equiv.\x0cspaces: 3-1 |Equiv.\x0bspaces: 3+0 |Equiv.\tspaces: 3+2 |Equiv.\x0bspaces: 3+0 |Equiv.\x0cspaces: 3-1 |Equiv.\x0cspaces: 3-1 |Equiv.\nspaces: 3+1 |Equiv.\rspaces: 3-2 |Equiv.\rspaces: 3-2 |Equiv.\tspaces: 3+2 |Equiv. spaces: 3-3 |Equiv. spaces: 3-3 Equivalent whitespace characters ---------------------------------------------- original default sort nat sort |white space: 3-2 |white\tspace: 3+2 |white\x0bspace: 3+0 |white\rspace: 3-3 |white\nspace: 3+1 |white\nspace: 3+1 |white\x0cspace: 3-1 |white\x0bspace: 3+0 |white\tspace: 3+2 |white\x0bspace: 3+0 |white\x0cspace: 3-1 |white\x0cspace: 3-1 |white\nspace: 3+1 |white\rspace: 3-3 |white space: 3-2 |white\tspace: 3+2 |white space: 3-2 |white\rspace: 3-3 # Case Indepenent sort ---------------------------------------------- original default sort nat sort |cASE INDEPENENT: 3-2 |cASE INDEPENENT: 3-2 |casE INDEPENENT: 3+0 |caSE INDEPENENT: 3-1 |caSE INDEPENENT: 3-1 |case INDEPENENT: 3+1 |casE INDEPENENT: 3+0 |casE INDEPENENT: 3+0 |caSE INDEPENENT: 3-1 |case INDEPENENT: 3+1 |case INDEPENENT: 3+1 |cASE INDEPENENT: 3-2 Case independent ---------------------------------------------- original default sort nat sort |caSE independent: 3-1 |cASE independent: 3-2 |casE i
1. Comment by _tom Oct 19, 2017
sorry: 'natural'