Parsing
- Posted by "Cuny, David" <ATB.DCUNY at HW1.CAHWNET.GOV> Apr 07, 1997
- 823 views
Joe wrote:

> I should know how to do this, but I don't. Could some one show me an
> example of parsing stuff read in from a file. For example sake an
> HTML file, and reading any <P>, <BR>, or <B> tags. Thanks.

Here's a program that will read an HTML file, and return a sequence of tokens in the form:

    { { tokenType, tokenValue } ... }

-- CODE BEGINS HERE

-- html.ex
-- simple HTML parser
--
-- Reads "test.htm", splits each line into text runs and <...> tags, and
-- prints one line per token. Tags may not span lines; an unmatched '<' or
-- '>' aborts the program with an error message.

integer tag         -- 1 while scanning inside a <...> tag, 0 otherwise
sequence buffer,    -- lines of the input file, as returned by read_file()
         parse,     -- resulting tokens: { { tokenType, tokenValue } ... }
         text       -- characters accumulated for the token being built

-- token types
constant
    STRING = 1,     -- string of text
    TAG    = 2      -- html tag (tokenValue is the text between '<' and '>')

procedure if_err( object test, sequence errMessage )
    -- generic error handler
    -- if test is true, print errMessage and abort.
    -- The exit status is non-zero so the calling shell or batch file can
    -- tell the run failed.
    if test then
        puts( 1, errMessage & '\n' )
        abort( 1 )
    end if
end procedure

function read_file( sequence fName )
    -- see help file under 'gets()'
    -- read file fName, return it as a sequence of lines.
    -- Each line keeps its trailing '\n', except possibly the last one.
    -- Aborts via if_err() when the file cannot be opened.
    integer handle
    sequence lines
    object line

    -- open file
    handle = open( fName, "r" )
    if_err( handle = -1, "Unable to open file " & fName & "." )

    -- read until end of file; gets() returns an atom (-1) at end of file
    lines = {}
    while 1 do
        line = gets( handle )
        if atom( line ) then
            exit
        end if
        lines = append( lines, line )
    end while

    close( handle )
    return lines
end function

-- read the file
buffer = read_file( "test.htm" )

-- parse the file
parse = {}
for line = 1 to length( buffer ) do

    -- each line starts outside of any tag, with no accumulated text
    tag = 0
    text = ""

    for char = 1 to length( buffer[line] ) do

        if buffer[line][char] = '<' then
            -- start of html tag; inside of tag already?
            if_err( tag != 0, "Error - unexpected '<' in tag." )
            -- save text accumulated before the tag
            if length( text ) > 0 then
                parse = append( parse, { STRING, text } )
            end if
            tag = 1
            text = ""

        elsif buffer[line][char] = '>' then
            -- end of html tag; was a tag started?
            if_err( tag = 0, "Error - unexpected '>'.\n" )
            -- write tag
            parse = append( parse, { TAG, text } )
            text = ""
            tag = 0

        elsif buffer[line][char] = '\n' then
            -- line terminator: not part of any token.
            -- End-of-line handling is done once, after this loop.

        else
            -- normal character, accumulate text
            text = text & buffer[line][char]

        end if

    end for

    -- end of line. This runs after the loop, rather than on '\n', so the
    -- last line of a file is handled even when it has no trailing newline.
    -- was tag started?
    if_err( tag, "Error - unexpected end of line in tag.\n" )
    -- text accumulated?
    if length( text ) > 0 then
        parse = append( parse, { STRING, text } )
    end if

end for

-- show results of parse
for i = 1 to length( parse ) do
    -- show results, based on type of token
    if parse[i][1] = STRING then
        printf( 1, "STRING: %s\n", {parse[i][2]} )
    elsif parse[i][1] = TAG then
        printf( 1, "TAG : %s\n", {parse[i][2]} )
    else
        if_err( 1, "Unknown token." )
    end if
end for

-- END OF CODE

Here's a test file:

-- TEST FILE BEGINS HERE
<B>This is bold</B>
<I>This is italic</I>
<P>This is a new paragraph.
<BR>This is a paragraph break.
<B>This is bold <I>and italic</I></B>
-- TEST FILE ENDS HERE

Hope this helps.

-- David Cuny