Delphi,幾個實用的HTML解析函數

1)HTML 標籤值攫取函數,任意標籤哦,純字符串分析,可以配合IDHTTP編程

uses StrUtils;

function ExtractHtmlTagValues(const HtmlText: string; TagName, AttribName: string; var Values: TStringList): integer;

function FindFirstCharAfterSpace(const Line: string; StartPos: integer): Integer;
var i: integer;
begin
Result := -1;
for i := StartPos to Length(Line) do
begin
if (Line[i] <> ' ') then
begin
Result := i;
exit;
end;
end;
end;

function FindFirstSpaceAfterChars(const Line: string; StartPos: integer): Integer;
begin
Result := PosEx(' ', Line, StartPos);
end;

function FindFirstSpaceBeforeChars(const Line: string; StartPos: integer): Integer;
var i: integer;
begin
Result := 1;
for i := StartPos downto 1 do
begin
if (Line[i] = ' ') then
begin
Result := i;
exit;
end;
end;
end;

var InnerTag: string;
LastPos, LastInnerPos: Integer;
SPos, LPos, RPos: Integer;
AttribValue: string;
ClosingChar: char;
TempAttribName: string;
begin
Result := 0;
LastPos := 1;
while (true) do
begin
// find outer tags '<' & '>'
LPos := PosEx('<', HtmlText, LastPos);
if (LPos <= 0) then break;
RPos := PosEx('>', HtmlText, LPos+1);
if (RPos <= 0) then
LastPos := LPos + 1
else
LastPos := RPos + 1;

// get inner tag
InnerTag := Copy(HtmlText, LPos+1, RPos-LPos-1);
InnerTag := Trim(InnerTag); // remove spaces
if (Length(InnerTag) < Length(TagName)) then continue;

// check tag name
if (SameText(Copy(InnerTag, 1, Length(TagName)), TagName)) then
begin
// found tag
AttribValue := '';
LastInnerPos := Length(TagName)+1;
while (LastInnerPos < Length(InnerTag)) do
begin
// find first '=' after LastInnerPos
RPos := PosEx('=', InnerTag, LastInnerPos);
if (RPos <= 0) then break;

// this way you can check for multiple attrib names and not a specific attrib
SPos := FindFirstSpaceBeforeChars(InnerTag, RPos);
TempAttribName := Trim(Copy(InnerTag, SPos, RPos-SPos));
if (true) then
begin
// found correct tag
LPos := FindFirstCharAfterSpace(InnerTag, RPos+1);
if (LPos <= 0) then
begin
LastInnerPos := RPos + 1;
continue;
end;
LPos := FindFirstCharAfterSpace(InnerTag, LPos); // get to first char after '='
if (LPos <= 0) then continue;
if ((InnerTag[LPos] <> '"') and (InnerTag[LPos] <> '''')) then
begin
// AttribValue is not between '"' or ''' so get it
RPos := FindFirstSpaceAfterChars(InnerTag, LPos+1);
if (RPos <= 0) then
AttribValue := Copy(InnerTag, LPos, Length(InnerTag)-LPos+1)
else
AttribValue := Copy(InnerTag, LPos, RPos-LPos+1);
end
else
begin
// get url between '"' or '''
ClosingChar := InnerTag[LPos];
RPos := PosEx(ClosingChar, InnerTag, LPos+1);
if (RPos <= 0) then
AttribValue := Copy(InnerTag, LPos+1, Length(InnerTag)-LPos-1)
else
AttribValue := Copy(InnerTag, LPos+1, RPos-LPos-1)
end;

if (SameText(TempAttribName, AttribName)) and (AttribValue <> '') then
begin
Values.Add(AttribValue);
inc(Result);
end;
end;

if (RPos <= 0) then
LastInnerPos := Length(InnerTag)
else
LastInnerPos := RPos+1;
end;
end;
end;
end;


用法示例:
取得頁面中所有鏈接
var
Links : TStringList;
LinkFound,i : Integer;
begin
Links := TStringList.Create;
LinkFound := ExtractHtmlTagValues(HtmlText,'A','HREF',Links);
for i:=0 to LinkFound-1 do
begin
//Add your own codes here
end;
Links.Free;
end;

2)表單元素值攫取函數,可以從HTML文本中按照給定的Input名稱解析出其Value

function GetValByName(S, Sub: string) : string;
var
EleS,EleE,iPos: Integer;
ELeStr,ValSt: String;
St,Ct : Integer;
function FindEleRange(str: string ; front : boolean; posi : integer): Integer;
var
i: integer;
begin
if Front then
begin
for i:=posi-1 downto 1 do
if Str[i]='<' then
begin
Result := i;
break;
end;
end else begin
for i := posi+1 to length(Str) do
if Str[i]='>' then
begin
Result := i;
break;
end;
end;
end;
function FindEnd (str : string; posi : integer) : Integer;
var
i: integer;
begin
for i:=posi to length(str) do
begin
if (str[i] ='"') or (str[i] ='''') or (str[i] =' ') then
begin
result := i-1;
break;
end;
end;
end;
begin
iPos := Pos('name="'+lowercase(Sub)+'"',lowercase(S));
if iPos = 0 then iPos := Pos('name='+lowercase(Sub),lowercase(S));
if iPos = 0 then iPos := Pos('name='''+lowercase(Sub)+'''',lowercase(S));
if iPos = 0 then exit;
EleS := FindEleRange(S,TRUE,iPos);
EleE := FindEleRange(S,FALSE,iPos);
EleStr := Copy(S,EleS,EleE-EleS+1);
ValSt := 'value="';
iPos := Pos(ValSt,EleStr);
if iPos = 0 then
begin
ValSt := 'value=''';
iPos := Pos(ValSt,EleStr);
end;
if iPos = 0 then
begin
ValSt := 'value=';
iPos := Pos(ValSt,EleStr);
end;
St := iPos+length(ValSt);
Ct := FindEnd(EleStr,St)-St+1;
Result := Copy(EleStr,St,Ct);
end;

用法示例:
取得頁面中名爲 Submit 的表單項的值
var
InputValue : String;
begin
InputValue := GetValByName(HtmlText,'Submit');
end;

3)取某兩個字符串中間的字符

function getStrFromHtml(var Source: String; SbStr, bStr, eStr: String): String;
var
I: Integer;
sbPos, bPos, ePos: Integer;
S: String;
begin
S := Source;

Result := '' ;
if SBStr <> '' then
Begin
sbPos := Pos(UpperCase(SbStr), UpperCase(S));
if sbPos > 0 then
Delete(S, 1, sbPos - 1 + length(sbStr))
Else
Exit;
End;

bPos := Pos(UpperCase(bStr), UpperCase(S));
if bPos > 0 then
Delete(S, 1, bPos - 1 + length(bStr))
Else
Exit;

ePos := pos(UpperCase(eStr), UpperCase(S));
if ePos > 0 then
Delete(S, ePos, length(S));

Result := S;
end;

用法實例:
FUserID := getStrFromHtml(reqStr, 'id="userID"', 'value="', '"');
發佈了14 篇原創文章 · 獲贊 14 · 訪問量 10萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章