-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathfasthtmlparser.pas
More file actions
324 lines (276 loc) · 8.84 KB
/
fasthtmlparser.pas
File metadata and controls
324 lines (276 loc) · 8.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
{%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
FastHTMLParser unit to parse HTML
(disect html into its tags and text.)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Original AUTHOR : James Azarja, http://www.jazarsoft.com/
CONTRIBUTORS : L505, http://z505.com
NOTES: Modified by Lars to work with both freepascal and delphi, and contains
more features
LEGAL : Copyright (C) 2004 Jazarsoft, All Rights Reserved.
Modified 2005 Lars (L505)
--------------------------------------------------------------------------------
LICENSE/TERMS
--------------------------------------------------------------------------------
This code may be used and modified by anyone so long as this header and
copyright information remains intact.
The code is provided "AS-IS" and without WARRANTY OF ANY KIND,
expressed, implied or otherwise, including and without limitation, any
warranty of merchantability or fitness for a particular purpose.
In no event shall the author be liable for any special, incidental,
indirect or consequential damages whatsoever (including, without
limitation, damages for loss of profits, business interruption, loss
of information, or any other loss), whether or not advised of the
possibility of damage, and on any theory of liability, arising out of
or in connection with the use or inability to use this software.
}
{$IFDEF FPC}{$MODE DELPHI}{$H+}{$ENDIF}
// {$DEFINE DEBUGLN_ON}
unit fasthtmlparser;
interface
uses
{$IFDEF KOL_MCK}
KOL,
{$else}
SysUtils,
{$ENDIF}
Dialogs, //DEBUG REMOVE THIS LATER
htmtool, htmutils;
{$IFDEF DEBUGLN_ON}
// dummy, default debugging
procedure debugproc(s: string);
// for custom debugging, assign this in your units
var debugln: procedure(s: string) = debugproc;
{$ENDIF}
type
// when tag content found in HTML, including names and values
// case insensitive analysis available via NoCaseTag
TOnFoundTag = procedure(NoCaseTag, ActualTag: string) of object;
// procedural:
TOnFoundTagP = procedure(NoCaseTag, ActualTag: string);
// when text found in the HTML
TOnFoundText = procedure(Text: string) of object;
// procedural:
TOnFoundTextP = procedure(Text: string);
// Lars's modified html parser, case insensitive or case sensitive
THTMLParser = class(TObject)
private
FElementFound: boolean;
FElementTag: string;
FElementTagEnd: string;
FElementHtml: string;
FElementName: string;
FElementId: string;
FFindingElementName: boolean;
FFindingElementId: boolean;
procedure NilOnFoundTag(NoCaseTag, ActualTag: string);
procedure NilOnFoundText(Text: string);
procedure ElementOnFoundTag(NoCaseTag, ActualTag: string);
procedure ElementOnFoundText(Text: string);
public
UseTagTextArray: boolean;
OnFoundTag: TOnFoundTag;
OnFoundText: TOnFoundText;
OnFoundTagP: TOnFoundTagP;
OnFoundTextP: TOnFoundTextP;
Raw: Pchar;
constructor Create(sRaw: string);overload;
constructor Create(pRaw: PChar);overload;
procedure Exec;
function GetElementByName(name: string; var Tag: string; var TagEnd: string): string;
function GetElementById(id: string; var Tag: string; var TagEnd: string): string;
end;
implementation
// default debugging, do nothing, let user do his own by assigning DebugLn var
procedure debugproc(s: string);
begin
end;
function CopyBuffer(StartIndex: PChar; Length: Integer): string;
var
S: string;
begin
SetLength(S, Length);
StrLCopy(@S[1], StartIndex, Length);
Result:= S;
end;
{ ************************ THTMLParser ************************************** }
constructor THTMLParser.Create(pRaw: Pchar);
begin
if pRaw = '' then exit;
if pRaw = nil then exit;
Raw:= pRaw;
FElementFound := false;
FElementTag := '';
FElementTagEnd := '';
FElementHtml := '';
FElementName := '';
FElementId := '';
FFindingElementName := false;
FFindingElementId := false;
end;
constructor THTMLParser.Create(sRaw: string);
begin
if sRaw = '' then exit;
Raw:= Pchar(sRaw);
end;
{ default dummy "do nothing" class events if unassigned }
procedure THTMLParser.NilOnFoundTag(NoCaseTag, ActualTag: string);
begin
end;
procedure THTMLParser.NilOnFoundText(Text: string);
begin
end;
procedure THTMLParser.ElementOnFoundTag(NoCaseTag, ActualTag: string);
begin
// tags inside
if FElementFound then FElementHtml := FElementHtml + ActualTag;
if (FElementId <> '') and (FFindingElementName) then begin
if GetVal(ActualTag, 'name') = FElementName then begin
FElementFound := true;
FElementTag := ActualTag;
// FElementName := '';
end;
end;
if (FElementId <> '') and (FFindingElementId) then begin
if GetVal(ActualTag, 'id') = FElementId then begin
FElementFound := true;
FElementTag := ActualTag;
// FElementId := '';
end;
end;
// closer tag
if NoCaseTag[2] = '/' then FElementFound := false;
end;
procedure THTMLParser.ElementOnFoundText(Text: string);
begin
if FElementFound then begin
FElementHtml := FElementHtml + Text;
end;
end;
{ default dummy "do nothing" procedural events if unassigned }
procedure NilOnFoundTagP(NoCaseTag, ActualTag: string);
begin
end;
procedure NilOnFoundTextP(Text: string);
begin
end;
procedure THTMLParser.Exec;
var
L, TL, I: Integer;
Done: Boolean;
TagStart, TextStart, P: PChar; // Pointer to current char.
C: Char;
begin
{$IFDEF DEBUGLN_ON}debugln('FastHtmlParser Exec Begin');{$ENDIF}
{ set nil events once rather than checking for nil each time tag is found }
if not assigned(OnFoundText) then OnFoundText:= NilOnFoundText;
if not assigned(OnFoundTag) then OnFoundTag:= NilOnFoundTag;
if not assigned(OnFoundTextP) then OnFoundTextP:= NilOnFoundTextP;
if not assigned(OnFoundTagP) then OnFoundTagP:= NilOnFoundTagP;
TL:= StrLen(Raw);
I:= 0;
P:= Raw;
Done:= False;
if P <> nil then
begin
TagStart:= nil;
repeat
TextStart:= P;
{ Get next tag position }
while Not (P^ in [ '<', #0 ]) do
begin
Inc(P); Inc(I);
if I >= TL then
begin
Done:= True;
Break;
end;
end;
if Done then Break;
{ Is there any text before ? }
if (TextStart <> nil) and (P > TextStart) then
begin
L:= P - TextStart;
{ Yes, copy to buffer, OO event:}
OnFoundText( CopyBuffer(TextStart, L) );
// procedural:
OnFoundTextP( CopyBuffer(TextStart, L) );
end else
begin
TextStart:= nil;
end;
{ No }
TagStart:= P;
while Not (P^ in [ '>', #0]) do
begin
// Find string in tag
if (P^ = '"') or (P^ = '''') then
begin
C:= P^;
Inc(P); Inc(I); // Skip current char " or '
// Skip until string end
while Not (P^ in [C, #0]) do
begin
Inc(P);Inc(I);
end;
end;
Inc(P);Inc(I);
if I >= TL then
begin
Done:= True;
Break;
end;
end;
if Done then Break;
{ Copy this tag to buffer }
L:= P - TagStart + 1;
// OO event
OnFoundTag(uppercase(CopyBuffer(TagStart, L)), CopyBuffer(TagStart, L ) ); //L505: added uppercase
// procedural
OnFoundTagP(uppercase(CopyBuffer(TagStart, L)), CopyBuffer(TagStart, L ) );
Inc(P); Inc(I);
if I >= TL then Break;
until (Done);
end;
{$IFDEF DEBUGLN_ON}debugln('FastHtmlParser Exec End');{$ENDIF}
end;
function THTMLParser.GetElementByName(name: string; var Tag: string; var TagEnd: string): string;
begin
result := '';
FFindingElementName := true;
OnFoundTag := ElementOnFoundTag;
OnFoundText := ElementOnFoundText;
FElementName := name;
Exec;
OnFoundTag := NilOnFoundTag;
OnFoundText := NilOnFoundText;
Tag := FElementTag;
TagEnd := FElementTagEnd;
result := FElementHtml;
FFindingElementName := false;
FElementTag := '';
FElementTagEnd := '';
FElementHtml := '';
FElementId := '';
FElementName := '';
end;
function THTMLParser.GetElementById(id: string; var Tag: string; var TagEnd: string): string;
begin
result := '';
FFindingElementId := true;
OnFoundTag := ElementOnFoundTag;
OnFoundText := ElementOnFoundText;
FElementId := id;
Exec;
OnFoundTag := NilOnFoundTag;
OnFoundText := NilOnFoundText;
Tag := FElementTag;
TagEnd := FElementTagEnd;
result := FElementHtml;
FFindingElementId := false;
FElementTag := '';
FElementTagEnd := '';
FElementHtml := '';
FElementId := '';
FElementName := '';
end;
end.