Re: Converting with Regular Expressions

new topic     » goto parent     » topic index » view thread      » older message » newer message
--demo regex extracting html links & text 
 
 
include std/io.e 
include std/error.e 
include std/regex.e as re 
include std/sequence.e as seq 
 
-- Input selection.
-- fname: path of an html file to read. While it is left empty the embedded
-- sample in html (below) is processed instead.
-- A real file may need preprocessing first (for example splitting on <a>),
-- because the extraction works line by line.
sequence fname = ""   
 
-- Embedded sample html, split on '\n' into a sequence of lines.
-- The third argument (1) is presumably split()'s no_empty flag, dropping
-- empty lines -- confirm against the std/sequence.e documentation.
-- NOTE(review): the leading underscores on the first link line look like
-- Euphoria's raw-string margin marker; confirm whether they are stripped
-- here or stay in the text. The link patterns match either way.
sequence html =  seq:split(` 

____<a href="http://careers.overflow.com">careers</a> 
     <div id="question-header"> 
	<h1><a href="/questions/468/custom-regexp-function" class="question-hyperlink">Custom REGEXP Function</a></h1> 
	</div> 
	<a href="http://www.mydomain.com/index.esp?var1=1&var2=2">A LINK</a> 
      `,'\n', 1) 

 
-- The compiled patterns are declared as object instead of regex:
-- re:new() yields an atom when a pattern fails to compile, and a regex-typed
-- variable would fail its type check before re:error_message() could be
-- used to report the reason.
-- (?i) makes a pattern case insensitive.

-- Raw link: one complete <a ...> ... </a> element.
--   \b   stops other tags that merely start with "a" (<abbr>, <address>,
--        <area>, ...) from being taken for a link
--   .*?  is lazy, so the match ends at the first closing </a> and never
--        runs across several links that share a line
object h_link = re:new("(?i)\\<a\\b.*?\\<\\/a[ \t\\\\]*\\>")

-- Applied to a raw link: group 1 captures the href value (link url),
-- group 2 captures the text between the last '>' ... '<' pair (link text).
object h_href = re:new("(?i)href[ \t]*=[ \t\"']*([^\"'>]*).*\\>([^<]*)\\<")

-- Stop right away, with the regex library's own message, when either
-- pattern did not compile.
if atom(h_link) then
	crash("error h_link %s\n", {re:error_message(h_link)})
elsif atom(h_href) then
	crash("error h_href %s\n", {re:error_message(h_href)})
end if
 
--**
-- Extract {link url, link text} pairs from lines of html.
--
-- Parameters:
--   # ##lines## : a sequence of text lines. Elements that are atoms or
--     empty sequences are skipped.
--
-- Returns:
--   A **sequence** of ##{url, text}## pairs, in the order they were found.
--
-- Comments:
--   * A link has to start and end on the same line; other text/html may
--     share that line.
--   * Every match of h_link on a line is handled separately. How many
--     matches a line yields depends on the h_link pattern: a greedy pattern
--     folds several links on one line into a single match.
--   * Relies on reasonably well formed html; urls that misuse quotes or
--     brackets will come out mangled.
--   * Bare urls (outside an <a> element) are not picked up.
--   * A raw link that h_href cannot take apart is reported on stderr and
--     skipped.

public function process_links(sequence lines)
	object matches, parts
	sequence raw, found = {}

	for x = 1 to length(lines) do
		if atom(lines[x]) or not length(lines[x]) then
			continue
		end if

		-- find_all() repeats find() along the line and returns one entry per
		-- match, or an empty sequence when the line holds no link
		matches = re:find_all(h_link, lines[x])

		for m = 1 to length(matches) do
			-- element 1 of a match is the {start, end} span of the whole match
			raw = lines[x][matches[m][1][1]..matches[m][1][2]]
			parts = re:find(h_href, raw)

			if atom(parts) then
				printf(2, "href re err=%d %s\n", {parts, raw})
				continue
			end if
			-- whole match plus the two capture groups are required
			if length(parts) < 3 then
				printf(2, "href len err=%d %s\n", {length(parts), raw})
				continue
			end if

			found = append(found, {
				raw[parts[2][1]..parts[2][2]], -- link
				raw[parts[3][1]..parts[3][2]]  -- linktext
			})
		end for
	end for

	return found
end function
 
-- Choose the input: the file named by fname when one is given, otherwise
-- the embedded sample html.
sequence links
object file_lines
if length(fname) then
	file_lines = read_lines(fname)
	-- read_lines() hands back an atom instead of a sequence when the file
	-- cannot be read; passing that on would fail the sequence type check of
	-- process_links() with a message that never mentions the file
	if atom(file_lines) then
		crash("cannot read html file %s\n", {fname})
	end if
	links = process_links(file_lines)
else
	links = process_links(html)
end if

-- Print one creole style line per link: [[link text -> link url]]
puts(1," Results:\n")
for i = 1 to length(links) do
	printf(1, "[[%s -> %s]]\n", {
			links[i][2], -- link text
			links[i][1]  -- link url
			})
end for
 
/* 

    *  Posted by euphoric  
I need to convert something like this: 
<a href="http://www.mydomain.com/index.esp?var1=1&var2=2">A LINK</a> 
into 
[[http://www.mydomain.com/index.esp?var1=1&var2=2 -> A LINK]] 
 
I'd like to use a regex match. 
 
note in euphoria creole it's 
 [[link text -> link]] 
 [[link | link text]] 
 [[linktext link]] 
(hope I got that right) 
 
 Results: 
[[careers -> http://careers.overflow.com]] 
[[Custom REGEXP Function -> /questions/468/custom-regexp-function]] 
[[A LINK -> http://www.mydomain.com/index.esp?var1=1&var2=2]] 
 
*/ 
new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu