Re: Converting with Regular Expressions
- Posted by ne1uno
Jan 23, 2011
--demo regex extracting html links & text
include std/io.e
include std/error.e
include std/regex.e as re
include std/sequence.e as seq
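-- Set fname below to a file path to read the html from that file
-- instead of the built-in sample; the file may need preprocessing
-- (e.g. splitting on <a>) so that each link ends up on its own line.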
sequence fname = ""
sequence html = seq:split(`
____<a href="http://careers.overflow.com">careers</a>
<div id="question-header">
<h1><a href="/questions/468/custom-regexp-function" class="question-hyperlink">Custom REGEXP Function</a></h1>
</div>
<a href="http://www.mydomain.com/index.esp?var1=1&var2=2">A LINK</a>
`,'\n', 1)
--use object instead of regex,
-- to avoid typecheck error on bad regex to show error_message()
-- {?i} case insensitive
--extract rawlink <a..</a
object h_link = re:new("(?i)\\<a.*\\/a[ \t\\\\]*\\>")
--extract only href from rawlink, capture linkurl, linktext
object h_href = re:new("(?i)href[ \t]*=[ \t\"']*([^\"'>]*).*\\>([^<]*)\\<")
if atom(h_link) then
crash("error h_link %s\n", {re:error_message(h_link)})
elsif atom(h_href) then
crash("error h_href %s\n", {re:error_message(h_href)})
end if
--**
-- limitation to simplify, link is on one line,
-- but there can be other text/html on that line.
-- will mangle some/many url that misuse quotes or brackets
-- doesn't try to catch multiple links on one line
-- won't pick bare url
-- edit to suit
public function process_links(sequence lines)
object result, resultn
sequence line, pross = {}
for x= 1 to length(lines) do
if atom(lines[x]) or not length(lines[x]) then continue end if
result = re:find(h_link, lines[x])
if atom(result) then --no link
continue
end if
-- depending too much on well formed html
-- will miss more than one link on line
-- all fixable with more time and error checking
line = lines[x][result[1][1]..result[1][2]]
resultn = re:find(h_href, line)
if atom(resultn) then
printf(2, "href re err=%d %s\n", {resultn, line })
continue
end if
if length(resultn)<3 then
printf(2, "href len err=%d %s\n", {length(resultn), line })
continue
end if
pross = append(pross,{
line[resultn[2][1]..resultn[2][2]], --link
line[resultn[3][1]..resultn[3][2]], --linktext
$
})
--?1/0
end for
return pross
end function
sequence links
if length(fname) then
links = process_links(read_lines(fname))
else
links = process_links(html)
end if
puts(1," Results:\n")
for i= 1 to length(links) do
printf(1,"[[%s -> %s]]\n",{
links[i][2],
links[i][1],
$
})
end for
/*
* Posted by euphoric
I need to convert something like this:
<a href="http://www.mydomain.com/index.esp?var1=1&var2=2">A LINK</a>
into
[[http://www.mydomain.com/index.esp?var1=1&var2=2 -> A LINK]]
I'd like to use a regex match.
note in euphoria creole it's
[[link text -> link]]
[[link | link text]]
[[linktext link]]
(hope I got that right)
Results:
[[careers -> http://careers.overflow.com]]
[[Custom REGEXP Function -> /questions/468/custom-regexp-function]]
[[A LINK -> http://www.mydomain.com/index.esp?var1=1&var2=2]]
*/
Not Categorized, Please Help
|
|