Parsing

new topic     » goto parent     » topic index » view thread      » older message » newer message

Joe wrote:

> I should know how to do this, but I don't. Could some one show me an
> example of parsing stuff read in from a file. For example sake an
> HTML file, and reading any <P>, <BR>, or <B> tags. Thanks.

Here's a program that will read an HTML file and build a sequence of tokens
in the form:

   { { tokenType, tokenValue } ... }

 -- CODE BEGINS HERE


 -- html.ex
 -- simple HTML parser

integer tag         -- non-zero while the parser is between '<' and '>'
sequence buffer     -- lines of the input file, as returned by read_file()
sequence parse      -- tokens collected so far, each { tokenType, tokenValue }
sequence text       -- characters accumulated for the token being built


 -- token types (first element of every token)
constant STRING = 1     -- string of text
constant TAG = 2        -- html tag


procedure if_err( object test, sequence errMessage )

    -- generic error handler
    --
    -- test:       condition to check; must evaluate to an atom, and any
    --             non-zero value counts as true
    -- errMessage: text to print when test is true; a newline is appended
    --             here, so callers should not add their own
    --
    -- if test is true, print errMessage on standard output and abort.
    -- the exit status is 1 (not 0) so that whatever launched the program
    -- can tell that it stopped because of an error.

    if test then
        puts( 1, errMessage & '\n' )
        abort( 1 )
    end if

end procedure


function read_file( sequence fName )

    -- read the text file fName and return its contents as a sequence
    -- of lines, each line exactly as gets() delivered it.
    -- aborts through if_err() when the file cannot be opened.

    integer fn
    sequence lines
    object nextLine

    fn = open( fName, "r" )
    if_err( fn = -1, "Unable to open file " & fName & "." )

    -- priming read, then keep going for as long as gets() hands back
    -- a sequence; it returns an atom once the end of file is reached
    lines = {}
    nextLine = gets( fn )
    while sequence( nextLine ) do
        lines = append( lines, nextLine )
        nextLine = gets( fn )
    end while

    close( fn )

    return lines

end function


 -- read the file
    buffer = read_file( "test.htm" )

 -- parse the file
 --
 -- each line is scanned one character at a time.  characters are
 -- collected in 'text' until a delimiter is met:
 --     '<'   flushes pending text as a STRING token and opens a tag
 --     '>'   flushes pending text as a TAG token and closes the tag
 --     '\n'  flushes pending text as a STRING token
 -- a tag must open and close on the same line; anything else aborts
 -- through if_err().  if_err() appends the newline to its message, so
 -- the messages below do not carry one of their own.
    parse = {}
    for line = 1 to length( buffer ) do
        -- clear tag
        tag = 0

        -- clear accumulated text
        text = ""

        for char = 1 to length( buffer[line] ) do

            -- start of html tag
            if buffer[line][char] = '<' then

                -- inside of tag already?  check this first so that the
                -- partial tag is not stored as a STRING before aborting
                if_err( tag != 0, "Error - unexpected '<' in tag." )

                -- save accumulated text
                if length( text ) > 0 then
                    parse = append( parse, { STRING, text } )
                end if

                -- start of tag
                tag = 1
                text = ""

            -- end of html tag
            elsif buffer[line][char] = '>' then

                -- was a tag started?
                if_err( tag = 0, "Error - unexpected '>'." )

                -- write tag
                parse = append( parse, { TAG, text } )
                text = ""

                -- clear flag
                tag = 0

            -- end of line
            elsif buffer[line][char] = '\n' then

                -- was tag started?
                if_err( tag, "Error - unexpected end of line in tag." )

                -- text accumulated?
                if length( text ) > 0 then
                    parse = append( parse, { STRING, text } )
                end if

                -- clear text
                text = ""

            -- normal character
            else

                -- accumulate text
                text = text & buffer[line][char]

            end if
        end for

        -- the last line of a file may not end in '\n'; without this
        -- step its trailing text would be dropped and an unfinished
        -- tag would go unreported.  for lines that did end in '\n'
        -- both tests are false here, so nothing extra happens.
        if_err( tag, "Error - unexpected end of line in tag." )
        if length( text ) > 0 then
            parse = append( parse, { STRING, text } )
        end if
    end for


 -- print every token, one per line, labelled with its type

    for n = 1 to length( parse ) do

        -- parse[n][1] is the token type, parse[n][2] the token value
        if parse[n][1] = TAG then
            printf( 1, "TAG   : %s\n", {parse[n][2]} )

        elsif parse[n][1] = STRING then
            printf( 1, "STRING: %s\n", {parse[n][2]} )

        else
            -- a type that is neither STRING nor TAG: report and stop
            if_err( 1, "Unknown token." )

        end if
    end for

 -- END OF CODE

Here's a test file:

 -- TEST FILE BEGINS HERE
<B>This is bold</B>
<I>This is italic</I>
<P>This is a new paragraph.
<BR>This is a line break.
<B>This is bold <I>and italic</I></B>
 -- TEST FILE ENDS HERE

Hope this helps.

 -- David Cuny

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu