Pastey Mike Sort
- Posted by _tom (admin) Jul 19, 2017
-- This module manages a general purpose sorting system -- Data can be sorted directly, as an index - either optionally by column(s). Or by a custom routine --sort(data [,col]) --sort_index(index, keys [,col]) --custom_sort(s, rtn) -------------------- natural order sorting -- compares any two objects using the natural order algorithm export function nat_compare(object x1, object x2) integer x1pt, x2pt, x1len, x2len, number, comp, String atom x1num, x2num object x1char, x2char if sequence(x1) and sequence(x2) then x1len = length(x1) x2len = length(x2) -- if both are strings -- inline String type-checking for speed String = 1 for i = 1 to x1len do x1char = x1[i] if not integer(x1char) or x1char > 255 or (x1char < ' ' and not find(x1char,"\t\n\r")) then -- not a String String = 0 exit end if end for if String then String = 1 for i = 1 to x2len do x2char = x2[i] if not integer(x2char) or x2char > 255 or (x2char < ' ' and not find(x2char,"\t\n\r")) then -- not a String String = 0 exit end if end for end if if not String then -- recurse for i = 1 to x1len do if i > x2len then return 1 end if comp = nat_compare(x1[i],x2[i]) if comp != 0 then return comp end if end for return compare(x1len,x2len) end if -- both are strings x1pt = 1 x2pt = 1 -- number flag number = 0 while x1pt <= x1len do if number then -- build numbers x1num = x1char - '0' while x1pt < x1len do x1pt += 1 x1char = x1[x1pt] if (x1char >= '0' and x1char <= '9') then x1num = (x1num * 10) + (x1char - '0') else x1pt -= 1 exit end if end while x2num = x2char - '0' while x2pt < x2len do x2pt += 1 x2char = x2[x2pt] if (x2char >= '0' and x2char <= '9') then x2num = (x2num * 10) + (x2char - '0') else x2pt -= 1 exit end if end while -- compare as atoms comp = compare(x1num,x2num) if comp != 0 then return comp end if -- still equal, keep going number = 0 x1pt += 1 x2pt += 1 else if x2pt > x2len then -- equal up to point x2 stops -- x1 is longer, therefore greater return 1 end if x1char = x1[x1pt] x2char = x2[x2pt] if (x1char >= '0' and x1char <= '9') and (x2char >= '0' and x2char <= '9') then -- first digit of number found, set flag number = 1 else comp = compare(x1char,x2char) if comp != 0 then return comp end if x1pt += 1 x2pt += 1 end if end if end while -- equal up to point x1 stops, longer (if either) is greater return compare(x1len,x2len) else -- one or both top-level elements are atoms -- compare normally return compare(x1,x2) end if end function export constant NAT = routine_id( "nat_compare" ) -------------------------- mike sort integer SortType object Column sequence Keys integer CompareRtn constant DIRECT=1, COLUMN=2, COLUMNS=3, INDEX=4, INDEXCOLUMNS=5, CUSTOM=6 procedure init() SortType = 0 Column = 0 Keys = {} CompareRtn = -1 end procedure function IsIndex( sequence s, object sortby ) -- ignore this case - sortby MUST BE a sequence for index sorting if not sequence(sortby) then return 0 end if -- check if is sequence of integers for i = 1 to length(s) do if not integer( s[i] ) then return 0 -- early out end if end for -- all elements integers at this point -- find the min/max integer Min = s[1] integer Max = Min for i = 2 to length(s) do integer x = s[i] if x > Max then Max = x elsif x < Min then Min = x end if end for -- check min/max integer try = Min >= 1 and Max <= length(sortby) -- exit return try end function function compare_columns(sequence a, sequence b, sequence cols) integer cmp, col -- loop through all columns for i = length(cols) to 1 by -1 do -- get the next column number col = cols[i] -- do the compare cmp = compare(a[col], b[col]) -- early out? if cmp then exit end if end for -- exit with the result return cmp end function function Kompare(object a, object b) if SortType = DIRECT then return compare(a, b) elsif SortType = COLUMN then return compare(a[Column], b[Column]) elsif SortType = COLUMNS then return compare_columns(a, b, Column) elsif SortType = INDEX then return compare(Keys[a], Keys[b]) elsif SortType = INDEXCOLUMNS then return compare_columns(Keys[a], Keys[b], Column) elsif SortType = CUSTOM then return call_func(CompareRtn, {a, b}) else ? 9/0 -- DEV error end if end function function insertion_sortXX(sequence x) -- insertion sort on steroids object temp integer insert, hi, mid -- loop along UNsorted list for i = 2 to length(x) do -- get the first element in the unsorted section temp = x[i] insert = 1 -- start of sorted section hi = i-1 -- end of sorted section -- find the insertion point -- binary search? if i > 15 then while insert <= hi do -- starts at top of sorted list and go to the bottom mid = floor((insert + hi) / 2) if Kompare(temp, x[mid]) < 0 then hi = mid - 1 else insert = mid + 1 end if end while -- linear search? else while hi do if Kompare( temp, x[hi] ) >= 0 then insert = hi+1 exit end if hi -= 1 end while end if -- shuffle & insert, but only if needed if insert < i then x[insert+1 .. i] = x[insert .. i-1] x[insert] = temp end if end for -- exit return x end function function merge_sort(sequence x) -- put x into ascending order using recursive merge sort -- best for medium to large sequences -- optimized with insertion_sort for the shortest sequences -- importantly, it is a *stable* sort -- init integer n = length(x) -- fast sort for small sequences if n <= 100 then return insertion_sortXX(x) end if -- recurse integer mid = floor(n / 2) sequence a = merge_sort(x[1 .. mid]) -- sort the first half sequence b = merge_sort(x[mid + 1 .. n]) -- sort the second half -- merge the two sorted halves into one sequence merged = {} while length(a) and length(b) do if Kompare(a[1], b[1]) <= 0 then -- this is the KEY to stability since the earlier 'a' element always leads when keys are equal merged = append(merged, a[1]) a = a[2 .. $] else merged = append(merged, b[1]) b = b[2 .. $] end if end while -- exit with merged data + remnants if length(a) then return merged & a end if return merged & b end function function go(sequence data) -- return var sequence ret -- fast sort? if length(data) < 200 then ret = insertion_sortXX( data ) -- std sort using merge sort else ret = merge_sort( data ) end if -- exit return ret end function function get_col(sequence s, integer col) -- init integer len = length(s) sequence ret = repeat(0, len) -- pull out the column for i = 1 to len do ret[i] = s[i][col] end for -- exit return ret end function function optimise(object col) if sequence(col) and length(col) = 1 then col = col[1] end if return col end function global function sort(sequence data, object col=0) -- early out? if not length(data) then return data end if -- init init() -- get column(s) Column = optimise(col) -- columns if sequence(col) then SortType = COLUMNS -- direct elsif not col then SortType = DIRECT -- column else SortType = COLUMN end if -- sort return go(data) end function global function sort_index( sequence index, sequence sortby, object col=0 ) -- early out? if not length(index) then return index end if -- check that really is an index if not IsIndex( index, sortby ) then puts(1, "Bad data sent to sort_index()\n") return index -- user error, should we provoke a crash? end if -- init init() -- get column(s) Column = optimise(col) -- work out the type & keys if sequence(Column) then SortType = INDEXCOLUMNS Keys = sortby elsif Column then SortType = INDEX Keys = get_col(sortby, Column) else SortType = INDEX Keys = sortby end if -- sort return go( index ) end function global function custom_sort( sequence data, integer rtn ) -- early out? if not length(data) then return data end if -- init SortType = CUSTOM Column = 0 Keys = data CompareRtn = rtn -- sort return go(data) end function ----------- testing procedure show( sequence title, sequence data ) printf(1, "\n%s\n", {title} ) for i=1 to length(data) do printf(1, "%s\n", {data[i]} ) end for end procedure sequence data = {"1A","Item10","Item1A","Item100","Item20","Item1","100","Item10B","10A"} show( "unsorted", data ) sequence number = sort(data) show( "`number order` sorted", number ) sequence natural = custom_sort( data, NAT ) show( "`natural order` sorted", natural )
1. Comment by _tom Jul 20, 2017
Note that the original "natural sorting" includes a second comparison for objects known to be string sequences -- not included here to save space. _tom