Re: help with storing user input

new topic     » goto parent     » topic index » view thread      » older message » newer message

----- Original Message -----
From: "Jason Dube" <dubetyrant at hotmail.com>
To: "EUforum" <EUforum at topica.com>
Subject: help with storing user input


>
> Hello,
> What would be an efficient way of seperating words in a user inputted
sentence? For example, to break apart the words in a sentence. I'm
specifically trying to develop a GOOD algorithm to separate words from a user
inputted sentence and store them as individual sequences. Like:
> user input:"mary had a little lamb"
> results:sequence 1st_sentence={"mary","had","a","little","lamb")
> Im having difficulties skipping whitespaces and converting to string
>
> how would euphoria do this:?
Here are a couple of routines that I use...


-------------------------------------
global function Tokenize(sequence pText, object pWhiteSpace, object pNonword,
                         object pQuotes) --> sequence
-------------------------------------
-- Splits pText into a sequence of tokens ('words') and returns that sequence.
--
-- Parameters:
--   pText       The text to split.
--   pWhiteSpace Characters that separate tokens and are never returned.
--               0 selects the default set (space and codes 8..13); any other
--               atom is used as the single whitespace character; a sequence
--               is used as given.
--   pNonword    Characters that end the current word and are also returned
--               as one-character tokens of their own.
--               0 selects a default punctuation set; any other atom is used
--               as the single non-word character; a sequence is used as given.
--   pQuotes     Quoting rules. Either a 5-element sequence
--                 {openers, closers, escapes, x, y}
--               where openers[n] is closed by closers[n] (both must have the
--               same length; elements 4 and 5 must be equal-length sequences
--               and are not otherwise used by this routine), or {} for
--               "no quoting", or 0 for the defaults (", ' and ` as quotes,
--               \ and ~ as escapes), or any other atom to use that one
--               character as both opener and closer. A malformed sequence
--               falls back to the defaults.
--
-- Returns:
--   A sequence of tokens. Text inside quotes is returned as one token,
--   without the surrounding quote characters but with any escape characters
--   left in place. An unterminated quote yields the rest of pText as the
--   final token.

    sequence lTokens
    integer lStartQuote     -- non-zero while inside a quoted section
    object  lEndQuote       -- the character that closes the current quote
    integer lStart          -- index where the current token began, 0 if none
    integer lPos

    -- Validate whitespace parameter
    if atom(pWhiteSpace) then
        if pWhiteSpace = 0 then
            pWhiteSpace = ' ' & 8 & 9 & 10 & 11 & 12 & 13
        else
            pWhiteSpace = {pWhiteSpace}
        end if
    end if

    -- Validate non-word parameter
    if atom(pNonword) then
        if pNonword = 0 then
            pNonword = "`~!@#$%^&*()_-+={[}]|\\:;\"'<,>.?/"
        else
            pNonword = {pNonword}
        end if
    end if

    -- Validate quote marks parameter.
    -- The tests rely on short-circuit evaluation: each element is only
    -- inspected once the earlier shape checks have passed.
    if sequence(pQuotes) then
        if length(pQuotes) = 0 then
            pQuotes = {{},{},{},{},{}}
        elsif length(pQuotes) != 5
           or atom(pQuotes[1])
           or atom(pQuotes[2])
           or atom(pQuotes[3])
           or length(pQuotes[1]) != length(pQuotes[2])
           or atom(pQuotes[4])
           or atom(pQuotes[5])
           or length(pQuotes[4]) != length(pQuotes[5])
        then
            -- Malformed: fall through to the defaults below.
            pQuotes = 0
        end if
    end if

    if atom(pQuotes) then
        if pQuotes = 0 then
            pQuotes = {"\"'`", "\"'`", "\\~","",""}
        else
            pQuotes = {{pQuotes}, {pQuotes},{},{},{}}
        end if
    end if

    -- Initialize
    lTokens = {}
    lStart = 0
    lStartQuote = 0
    lEndQuote = 0

    for i = 1 to length(pText) do
        if lStartQuote != 0 then
            -- Inside a quoted section: only a closing quote is of interest.
            -- equal() is used so a nested sequence element cannot cause a
            -- "true/false condition must be an atom" error.
            if equal(pText[i], lEndQuote) then
                -- Count the run of escape characters immediately before this
                -- quote (never looking further back than the quoted text).
                -- An even run means the escapes pair up with each other and
                -- the quote is real; an odd run means the quote is escaped.
                lPos = i - 1
                while lPos >= lStart and find(pText[lPos], pQuotes[3]) do
                    lPos -= 1
                end while

                if remainder(i - 1 - lPos, 2) = 0 then
                    lTokens = append(lTokens, pText[lStart .. i - 1])
                    lStart = 0
                    lStartQuote = 0
                    lEndQuote = 0
                end if
            end if

        else
            lPos = find(pText[i], pQuotes[1])
            if lPos != 0 then
                -- Opening quote. Flush any word in progress first so that
                -- text directly in front of the quote is not lost.
                if lStart != 0 then
                    lTokens = append(lTokens, pText[lStart .. i - 1])
                end if
                lStartQuote = lPos
                lStart = i + 1
                lEndQuote = pQuotes[2][lPos]

            elsif find(pText[i], pWhiteSpace) then
                -- Whitespace ends the current word, if there is one.
                if lStart != 0 then
                    lTokens = append(lTokens, pText[lStart .. i - 1])
                    lStart = 0
                end if

            elsif find(pText[i], pNonword) then
                -- A non-word character ends the current word (if any) and
                -- is emitted as a token of its own.
                if lStart != 0 then
                    lTokens = append(lTokens, pText[lStart .. i - 1])
                end if
                lTokens = append(lTokens, {pText[i]})
                lStart = 0

            elsif lStart = 0 then
                -- First character of a new word.
                lStart = i
            end if
        end if
    end for

    -- Flush whatever is left: a trailing word or an unterminated quote.
    if lStart != 0 then
        lTokens = append(lTokens, pText[lStart .. length(pText)])
    end if

    return lTokens
end function

-------------------------------------
global function SimpleTokenize(sequence s, object c)
-------------------------------------
-- Returns 's' split into a sequence of words, where words are separated by
-- one or more elements equal to 'c'.
--
-- Parameters:
--   s  The sequence to split.
--   c  The delimiter; each element of 's' is compared to it with equal().
--
-- Returns:
--   A sequence of the non-empty runs of 's' found between delimiters.
--   Leading, trailing and repeated delimiters never produce empty words,
--   so an empty or all-delimiter 's' returns {}.
    integer slen, spt, i
    sequence parsed

    parsed = {}
    slen = length(s)

    i = 1
    while i <= slen do
        -- Skip any run of delimiters in front of the next word.
        while i <= slen and equal(s[i], c) do
            i += 1
        end while

        -- Scan to the end of the word.
        spt = i
        while i <= slen and not equal(s[i], c) do
            i += 1
        end while

        -- Only store a word if one was found; when 's' ends in delimiters
        -- the scan above stops immediately with nothing to store.
        if i > spt then
            parsed = append(parsed, s[spt..i-1])
        end if

        -- Step over the delimiter that ended the word.
        i += 1
    end while

    return parsed
end function


----------------
cheers,
Derek Parnell

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu