Jeg vil gerne være vært ved en gratis HTML parser :
////////////////////////////////////////////////////////////////////////////////
unit HTMLParser;
interface
uses
Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs;
type
TOnFoundHyperlink = procedure(Sender: TObject; Hyperlink: string) of object;
TOnFoundTag = procedure(Sender: TObject; Tag: string) of object;
TOnFoundText = procedure(Sender: TObject; Text: string) of object;
TOnFoundComment = procedure(Sender: TObject; Comment: string) of object;
TOnParsing = procedure(Sender: TObject; Lines, Total: Integer) of object;
TLocalRemote = class(TPersistent)
private
FLocal: TStrings;
FRemote: TStrings;
protected
public
constructor Create;
destructor Destroy; override;
published
property Local: TStrings read FLocal write FLocal;
property Remote: TStrings read FRemote write FRemote;
end;
TParsed = class(TPersistent)
private
FHyperlinks: TStrings; { All Related Document, Local+Remote }
FImages: TStrings; { Document in <IMG SRC=...}
FEmails: TStrings; { Anchors <A.. with "mailto:" }
FHTTPLinks: TStrings; { Anchors <A.. with "http:" }
FFTPLinks: TStrings; { Anchors <A.. with "ftp:" }
FLocalLinks: TStrings; { Anchors <A.. with no protocol }
FFramePages: TStrings; { <FRAME SRC=... }
{ .. by extension.. }
FHTML: TLocalRemote; { *.HTM,*.HTML,*.SHTML,*.DHTML }
FImage: TLocalRemote; { *.JPEG, *.GIF, *.TIF .. }
FMedia: TLocalRemote; { *.AVI, *.MP3, *.MPEG ... }
FJava: TLocalRemote; { *.JAVA, *.CLASS, *.JS }
FActiveX: TLocalRemote; { *.CAB .. }
FASP: TLocalRemote; { *.ASP }
FPHP: TLocalRemote; { *.PHP, *.PHP3 ... }
FPERL: TLocalRemote; { *.PL, *.CGI.. }
FTitle: string; { <TITLE>...</TITLE> }
FKeyword: string; { <META NAME="KEYWORD" ... }
FDescription: string; { <META NAME="DESCRIPTION" ... }
protected
public
constructor Create;
destructor Destroy; override;
published
property Title: string read FTitle write FTitle;
property Keyword: string read FKeyWord write FKeyword;
property Description: string read FDescription write FDescription;
property HTML: TLocalRemote read FHTML write FHTML;
property Image: TLocalRemote read FImage write FImage;
property Media: TLocalRemote read FMedia write FMedia;
property Java: TLocalRemote read FJava write FJava;
property ActiveX: TLocalRemote read FActiveX write FActiveX;
property ASP: TLocalRemote read FASP write FASP;
property PHP: TLocalRemote read FPHP write FPHP;
property PERL: TLocalRemote read FPERL write FPERL;
property Hyperlinks: TStrings read FHyperlinks write FHyperlinks;
property Images: TStrings read FImages write FImages;
property EMails: TStrings read FEmails write FEmails;
property HTTPlinks: TStrings read FHTTPlinks write FHTTPlinks;
property FTPlinks: TStrings read FFTPlinks write FFTPlinks;
property Locallinks: TStrings read FLocallinks write FLocallinks;
property FramePages: TStrings read FFramePages write FFramePages;
end;
THTMLParser = class
private
FRaw: TStrings;
FParsed: TParsed;
{ Events }
FOnParsing: TOnParsing;
FOnFoundHyperlink: TOnFoundHyperlink;
FOnFoundTag: TOnFoundTag;
FOnFoundText: TOnFoundText;
FOnFoundComment: TOnFoundComment;
FParsedLines: Integer;
FCurrentLine: Integer;
FTotalLines: Integer;
procedure SetRaw(Value: TStrings);
protected
Buffering: Boolean;
Buffer: string;
BufferingTT: string; { Tag }
procedure AddTag(Tag: string);
procedure AddText(Text: string);
public
constructor Create;
destructor Destroy; override;
procedure Execute;
property ParsedLines: Integer read FParsedLines;
property TotalLines: Integer read FTotalLines;
property CurrentLine: Integer read FCurrentLine;
published
property HTML: TStrings read FRaw write SetRaw;
property Parsed: TParsed read FParsed write FParsed;
property OnFoundComment: TOnFoundComment read FOnFoundComment write FOnFoundComment;
property OnFoundHyperlink: TOnFoundHyperLink read FOnFoundHyperLink write FOnFoundHyperLink;
property OnFoundTag: TOnFoundTag read FOnFoundTag write FOnFoundTag;
property OnFoundText: TOnFoundText read FOnFoundText write FOnFoundText;
property OnParsing: TOnParsing read FOnParsing write FOnParsing;
end;
implementation
procedure ParseURL(const URL: string; var Protocol, Server, Script, Variable: string);
var
S: string;
begin
if Url = '' then
Exit;
Protocol := '';
Server := '';
S := URL;
if Pos('://', S) <> 0 then
begin
Protocol := Copy(S, 1, Pos('://', S) - 1);
Delete(S, 1, Pos('://', S) + 2);
end
else if Pos('MAILTO:', S) <> 0 then
begin
Protocol := 'MAILTO:';
Delete(S, 1, 7);
end
else
Protocol := 'http';
while Pos('/', S) <> 0 do
begin
Server := Server + Copy(S, 1, Pos('/', S));
Delete(S, 1, Pos('/', S));
end;
if Pos('.', S) = 0 then
begin
Server := Server + S;
if Server[Length(Server)] <> '/' then
Server := Server + '/';
end
else
begin
if Pos('?', S) <> 0 then
begin
Script := Copy(S, 1, Pos('?', S) - 1);
Delete(S, 1, Pos('?', S));
Variable := S;
end
else
Script := S;
end;
end;
function ExtractQuotedStr(Str: string; Quote: Char): string;
var
StartPos, Index: integer;
begin
Result := '';
StartPos := Pos(Quote, Str);
for Index := StartPos + 1 to Length(Str) do
if Str[Index] <> Quote then
Result := Result + Str[Index]
else
Break;
end;
function GetTagAttribute(Tag, Attribute: string): string;
var
AttrPos, NulPos, Count: Integer;
UTag, UAttribute: string;
Quoted: Boolean;
begin
NulPos := 0;
Quoted := False;
UTag := Uppercase(Tag);
UAttribute := Uppercase(Attribute);
AttrPos := Pos(UAttribute, UTag);
if AttrPos <> 0 then
begin
for Count := AttrPos to Length(Tag) do
begin
if (Tag[Count] = '"') then
begin
if not Quoted then
Quoted := True
else
Quoted := False;
end;
if ((Tag[Count] = ' ') and not Quoted) or (Tag[Count] = '>') or (Count = Length(tag)) then
begin
NulPos := Count;
Break;
end;
end;
Result := Copy(Tag, AttrPos, NulPos - AttrPos + 1);
end;
end;
function ExtractValue(Attribute: string): string;
var
Str: string;
Count: Integer;
StartPos: Integer;
Quoted: Boolean;
begin
Result := '';
Quoted := False;
Str := Attribute;
StartPos := Pos('=', Attribute);
for Count := StartPos + 1 to Length(Attribute) do
begin
if (Attribute[Count] <> '"') or (not Quoted and (Attribute[Count] <> ' ')) then
Result := Result + Attribute[Count]
else
begin
if (Attribute[Count] = '"') and not Quoted then
Quoted := True
else
Break;
end;
end;
Result := ExtractQuotedStr(Result, '"');
end;
constructor TLocalRemote.Create;
begin
inherited Create;
FLocal := TStringList.Create;
FRemote := TStringList.Create;
end;
destructor TLocalRemote.Destroy;
begin
FLocal.Free;
FRemote.Free;
inherited Destroy;
end;
constructor TParsed.Create;
begin
inherited Create;
FHyperlinks := TStringList.Create;
FImages := TStringList.Create;
FEmails := TStringList.Create;
FHTTPLinks := TStringList.Create;
FFTPLinks := TStringList.Create;
FLocalLinks := TStringList.Create;
FFramePages := TStringList.Create;
FHTML := TLocalRemote.Create;
FImage := TLocalRemote.Create;
FMedia := TLocalRemote.Create;
FJava := TLocalRemote.Create;
FActiveX := TLocalRemote.Create;
FPHP := TLocalRemote.Create;
FASP := TLocalRemote.Create;
FPERL := TLocalRemote.Create;
end;
destructor TParsed.Destroy;
begin
FHTML.Free;
FImage.Free;
FMedia.Free;
FJava.Free;
FActiveX.Free;
FPHP.Free;
FASP.Free;
FPERL.Free;
FFramePages.Free;
FHTTPLinks.Free;
FFTPLinks.Free;
FLocalLinks.Free;
FEmails.Free;
FHyperlinks.Free;
FImages.Free;
inherited Destroy;
end;
procedure THTMLParser.SetRaw(Value: TStrings);
begin
FRaw.Clear;
FRaw.Assign(Value);
end;
procedure THTMLParser.AddTag(Tag: string);
var
TagName: string;
Hyperlink: string;
UHyperLink: string;
MetaName, MetaContent: string;
P, S, Sc, Vr: string;
Ext: string;
begin
if Buffering then
begin
if Pos(Uppercase(BufferingTT), Uppercase(Tag)) = 0 then
begin
Buffer := Buffer + '<' + Tag + '>';
Exit;
end;
end;
{ Remove all trailing spaces }
Trim(Tag);
if Pos('!--', Tag) = 0 then
if Assigned(FOnFoundTag) then
FOnFoundTag(Self, Tag);
if Pos(' ', Tag) <> 0 then
TagName := Uppercase(Copy(Tag, 1, Pos(' ', Tag) - 1))
else
TagName := Uppercase(Tag);
if TagName = 'A' then { Anchors }
begin
Hyperlink := ExtractValue(GetTagAttribute(Tag, 'href'));
UHyperlink := Uppercase(Hyperlink);
ParseURL(UHyperlink, P, S, Sc, Vr);
{ Extension Check }
Ext := ExtractFileExt(SC);
if Pos('HTM', Ext) <> 0 then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FHTML.FRemote.Add(Hyperlink)
else
FParsed.FHTML.FLocal.Add(Hyperlink);
end
else if ((Ext = '.JPG') or
(Ext = '.JPEG') or
(Ext = '.GIF') or
(Ext = '.TIF') or
(Ext = '.PCX') or
(Ext = '.PNG') or
(Ext = '.BMP')) then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FImage.FRemote.Add(Hyperlink)
else
FParsed.FImage.FLocal.Add(Hyperlink);
end
else if ((Ext = '.AVI') or
(Ext = '.MP3') or
(Ext = '.AU') or
(Ext = '.MOV') or
(Ext = '.MPG') or
(Ext = '.MPEG')) then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FMedia.FRemote.Add(Hyperlink)
else
FParsed.FMedia.FLocal.Add(Hyperlink);
end
else if ((Ext = '.JS') or
(Ext = '.CLASS') or
(Ext = '.JAVA')) then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FJava.FRemote.Add(Hyperlink)
else
FParsed.FJava.FLocal.Add(Hyperlink);
end
else if ((Ext = '.ASP')) then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FASP.FRemote.Add(Hyperlink)
else
FParsed.FASP.FLocal.Add(Hyperlink);
end
else if ((Ext = '.PL') or
(Ext = '.CGI')) then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FPERL.FRemote.Add(Hyperlink)
else
FParsed.FPERL.FLocal.Add(Hyperlink);
end
else if Pos('PHP', Ext) <> 0 then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FPHP.FRemote.Add(Hyperlink)
else
FParsed.FPHP.FLocal.Add(Hyperlink);
end
else if ((Ext = '.CAB')) then
begin
if Pos('
HTTP://', UHyperlink) <> 0 then
FParsed.FActiveX.FRemote.Add(Hyperlink)
else
FParsed.FActiveX.FLocal.Add(Hyperlink);
end;
if Pos('MAILTO:', Uppercase(hyperlink)) <> 0 then
begin
FParsed.FEmails.Add(Copy(Hyperlink, 8, Length(Hyperlink) - 7));
end
else if Pos('
FTP://', UHyperlink) <> 0 then
begin
FParsed.FFTPLinks.Add(Hyperlink);
end
else if Pos('
HTTP://', UHyperlink) <> 0 then
begin
FParsed.FHTTPLinks.Add(Hyperlink);
end
else if Pos('://', UHyperlink) = 0 then
begin
FParsed.FLocalLinks.Add(Hyperlink);
end
else
if Assigned(FOnFoundHyperlink) then
FOnFoundHyperlink(Self, Hyperlink);
FParsed.FHyperlinks.Add(Hyperlink);
end
else
if TagName = 'IMG' then { Image }
begin
FParsed.FImages.Add(ExtractValue(GetTagAttribute(Tag, 'src')));
end
else
if TagName = 'FRAME' then { Frame }
begin
FParsed.FFramePages.Add(ExtractValue(GetTagAttribute(Tag, 'src')));
end
else
if TagName = 'META' then { Meta }
begin
MetaName := Uppercase(ExtractValue(GetTagAttribute(Tag, 'name')));
MetaContent := ExtractValue(GetTagAttribute(Tag, 'content'));
if MetaName = 'KEYWORD' then
FParsed.FKeyword := MetaContent
else if MetaName = 'DESCRIPTION' then
FParsed.FDescription := MetaContent;
end
else
if TagName = '!--' then { Comment }
begin
if Assigned(FOnFoundComment) then
FOnFoundComment(Self, Copy(Tag, 5, Length(Tag) - 3 - 4));
{ <!-- Comment --> }
end
else
if (TagName = 'TITLE') then { Title }
begin
Buffering := True;
BufferingTT := '/TITLE';
end
else if (TagName = '/TITLE') then
begin
Buffering := False;
BufferingTT := '';
FParsed.FTitle := Buffer;
Buffer := '';
end;
end;
procedure THTMLParser.AddText(Text: string);
begin
if Buffering then
Buffer := Text;
if not Buffering then
if Assigned(FOnFoundText) then
FOnFoundText(Self, Text);
Text := '';
end;
procedure THTMLParser.Execute;
var
LinesIndex: Integer;
Line: string;
CharsIndex: Integer;
TagDepth: Integer;
Tag: string;
InTag: Boolean;
IgnoreChar: Boolean;
Text: string;
I: Integer;
begin
FParsed.Hyperlinks.Clear;
FParsed.Images.Clear;
FParsed.Emails.Clear;
FParsed.HTTPLinks.Clear;
FParsed.FTPLinks.Clear;
FParsed.LocalLinks.Clear;
FParsed.FramePages.Clear;
TagDepth := 0;
Tag := '';
Text := '';
InTag := False;
IgnoreChar := False;
FTotalLines := FRaw.Count;
for LinesIndex := 1 to FRaw.Count do
begin
Line := FRaw[LinesIndex - 1];
if pos('Preload buttons', Line) = 0 then
FCurrentLine := LinesIndex
else
FCurrentLine := LinesIndex;
if Line = '' then
Continue;
for CharsIndex := 1 to Length(Line) do
begin
if IgnoreChar then
IgnoreChar := False;
if Assigned(FOnParsing) and (TagDepth = 0) then
FOnParsing(Self, LinesIndex, FRaw.Count);
if (Line[CharsIndex] = '<') and not InTag then
begin
{ Save text on buffer }
if Text <> '' then
begin
AddText(Text);
Text := '';
end;
InTag := True;
end
else
if (Line[CharsIndex] = '>') and InTag then
begin
Tag := Tag + '>';
IgnoreChar := True;
for I := Length(Tag) downto 1 do
begin
if Tag[I] = '<' then
Break;
end;
AddTag(Copy(Tag, I + 1, Length(tag) - I));
InTag := False;
Delete(Tag, 1, Length(tag) - I + 1);
end; {else }
if not IgnoreChar and not InTag then
begin
{ Text }
Text := Text + Line[CharsIndex];
end
else
if not IgnoreChar and InTag then
begin
{ Tag(s) }
Tag := Tag + Line[CharsIndex];
end;
end;
FParsedLines := LinesIndex;
end;
end;
constructor THTMLParser.Create;
begin
inherited;
FParsed := TParsed.Create;
FRaw := TStringList.Create;
end;
destructor THTMLParser.Destroy;
begin
FParsed.Free;
FRaw.Free;
inherited;
end;
end.