Parsing
Joe wrote:
> I should know how to do this, but I don't. Could some one show me an
> example of parsing stuff read in from a file. For example sake an
> HTML file, and reading any <P>, <BR>, or <B> tags. Thanks.
Here's a program that will read an HTML file and build a sequence of tokens
in the form:
{ { tokenType, tokenValue } ... }
-- CODE BEGINS HERE
-- html.ex
-- simple HTML parser
-- parser state shared by the top-level script
integer  tag       -- set to 1 between '<' and '>', 0 otherwise
sequence buffer    -- lines of the input file, as returned by read_file()
sequence parse     -- accumulated { tokenType, tokenValue } pairs
sequence text      -- characters collected for the token in progress

-- token types (first element of each pair stored in parse)
constant
    STRING = 1,    -- run of plain text
    TAG    = 2     -- html tag (the characters between '<' and '>')
procedure if_err( object test, sequence errMessage )
    -- Generic error handler.
    --   test       : condition to check; must be an atom (non-zero = error)
    --   errMessage : text printed, followed by a newline, before aborting
    -- If test is true, prints errMessage to the screen (file 1) and ends the
    -- program. Returns normally when test is false.
    if test then
        puts( 1, errMessage & '\n' )
        -- exit with a non-zero status so the OS / calling script can tell
        -- the run failed (0 conventionally means success)
        abort( 1 )
    end if
end procedure
function read_file( sequence fName )
    -- Read the text file fName and return its contents as a sequence of
    -- lines, each line being whatever gets() handed back for it.
    -- Aborts through if_err() when the file cannot be opened.
    integer fn
    sequence lines
    object nextLine

    fn = open( fName, "r" )
    if_err( fn = -1, "Unable to open file " & fName & "." )

    lines = {}

    -- gets() yields a sequence for every line and an atom at end of file,
    -- so keep collecting for as long as a sequence comes back
    nextLine = gets( fn )
    while sequence( nextLine ) do
        lines = append( lines, nextLine )
        nextLine = gets( fn )
    end while

    close( fn )
    return lines
end function
-- read the file
buffer = read_file( "test.htm" )

-- parse the file into a flat list of { tokenType, tokenValue } pairs.
-- Tags may not span lines; text is split into a separate STRING token
-- at every tag boundary and at every end of line.
parse = {}
for line = 1 to length( buffer ) do
    -- every line starts outside of a tag with no accumulated text
    tag = 0
    text = ""

    for char = 1 to length( buffer[line] ) do

        if buffer[line][char] = '<' then
            -- start of html tag: a second '<' before the closing '>' is an error
            if_err( tag != 0, "Error - unexpected '<' in tag." )
            -- save any plain text that preceded the tag
            if length( text ) > 0 then
                parse = append( parse, { STRING, text } )
            end if
            tag = 1
            text = ""

        elsif buffer[line][char] = '>' then
            -- end of html tag: must have been opened first
            if_err( tag = 0, "Error - unexpected '>'.\n" )
            -- text holds the characters between '<' and '>'
            parse = append( parse, { TAG, text } )
            text = ""
            tag = 0

        elsif buffer[line][char] = '\n' then
            -- end of line: tags are not allowed to continue onto the next line
            if_err( tag, "Error - unexpected end of line in tag.\n" )
            if length( text ) > 0 then
                parse = append( parse, { STRING, text } )
            end if
            text = ""

        else
            -- normal character: accumulate
            text = text & buffer[line][char]
        end if

    end for

    -- The last line of a file may not end in '\n', in which case the
    -- end-of-line branch above never ran for it. Apply the same checks here
    -- so trailing text is not lost and an unclosed tag is still reported.
    -- For lines that did end in '\n' this is a no-op (tag = 0, text empty).
    if_err( tag, "Error - unexpected end of line in tag.\n" )
    if length( text ) > 0 then
        parse = append( parse, { STRING, text } )
    end if
end for
-- print every token collected in parse, one per output line,
-- prefixed with the name of its token type
sequence token
for n = 1 to length( parse ) do
    token = parse[n]    -- { tokenType, tokenValue }
    if token[1] = TAG then
        printf( 1, "TAG : %s\n", {token[2]} )
    elsif token[1] = STRING then
        printf( 1, "STRING: %s\n", {token[2]} )
    else
        -- neither known type: report and stop
        if_err( 1, "Unknown token." )
    end if
end for
-- END OF CODE
Here's a test file:
-- TEST FILE BEGINS HERE
<B>This is bold</B>
<I>This is italic</I>
<P>This is a new paragraph.
<BR>This is a line break.
<B>This is bold <I>and italic</I></B>
-- TEST FILE ENDS HERE
Hope this helps.
-- David Cuny
|
Not Categorized, Please Help
|
|