html网页采集

 

UI_Less.pas:

  1 unit UI_Less;
  2 
  3 interface
  4 
  5 uses
  6   Windows, Classes, Messages, Forms, MsHtml, Urlmon, ActiveX;
  7 
  8 const
  9   WM_USER_STARTWALKING = WM_USER + 1;
 10   DISPID_AMBIENT_DLCONTROL = (-5512);
 11   READYSTATE_COMPLETE = $00000004;
 12 
 13   DLCTL_DLIMAGES = $00000010;
 14   DLCTL_VIDEOS = $00000020;
 15   DLCTL_BGSOUNDS = $00000040;
 16   DLCTL_NO_SCRIPTS = $00000080;
 17   DLCTL_NO_JAVA = $00000100;
 18   DLCTL_NO_RUNACTIVEXCTLS = $00000200;
 19   DLCTL_NO_DLACTIVEXCTLS = $00000400;
 20   DLCTL_DOWNLOADONLY = $00000800;
 21   DLCTL_NO_FRAMEDOWNLOAD = $00001000;
 22   DLCTL_RESYNCHRONIZE = $00002000;
 23   DLCTL_PRAGMA_NO_CACHE = $00004000;
 24   DLCTL_NO_BEHAVIORS = $00008000;
 25   DLCTL_NO_METACHARSET = $00010000;
 26   DLCTL_URL_ENCODING_DISABLE_UTF8 = $00020000;
 27   DLCTL_URL_ENCODING_ENABLE_UTF8 = $00040000;
 28   DLCTL_FORCEOFFLINE = $10000000;
 29   DLCTL_NO_CLIENTPULL = $20000000;
 30   DLCTL_SILENT = $40000000;
 31   DLCTL_OFFLINEIFNOTCONNECTED = $80000000;
 32   DLCTL_OFFLINE = DLCTL_OFFLINEIFNOTCONNECTED;
 33 
 34 type
 35   TUILess = class(TComponent, IUnknown, IDispatch, IPropertyNotifySink,
 36     IOleClientSite)
 37   private
 38     FDocTitle: string;
 39     FBodyText: TStrings;
 40     FBodyHtml: TStrings;
 41   protected
 42     /// IDISPATCH
 43     function Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
 44       Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
 45       stdcall;
 46     /// IPROPERTYNOTIFYSINK
 47     function OnChanged(DispID: TDispID): HResult; stdcall;
 48     function OnRequestEdit(DispID: TDispID): HResult; stdcall;
 49     /// IOLECLIENTSITE
 50     function SaveObject: HResult; stdcall;
 51     function GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
 52       out mk: IMoniker): HResult; stdcall;
 53     function GetContainer(out container: IOleContainer): HResult; stdcall;
 54     function ShowObject: HResult; stdcall;
 55     function OnShowWindow(fShow: BOOL): HResult; stdcall;
 56     function RequestNewObjectLayout: HResult; stdcall;
 57     ///
 58     function LoadUrlFromMoniker: HResult;
 59     function LoadUrlFromFile: HResult;
 60     // * We only use LoadUrlFromMoniker, but we could use LoadUrlFromFile instead.
 61 
 62   public
 63     constructor Create(AOwner: TComponent); override;
 64     destructor Destroy; override;
 65     property DocTitle: string read FDocTitle;
 66     property BodyText: TStrings read FBodyText write FBodyText;
 67     property BodyHtml: TStrings read FBodyHtml write FBodyHtml;
 68     function Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
 69       : IHTMLELEMENTCollection;
 70     procedure GetAnchorList(IC: IHTMLELEMENTCollection; Anchorlist: TStrings);
 71     procedure GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
 72   end;
 73 
 74 implementation
 75 
 76 var
 77   Doc: IhtmlDocument2;
 78   _URL: PWidechar;
 79 
 80 constructor TUILess.Create(AOwner: TComponent);
 81 begin
 82   inherited Create(AOwner);
 83   FBodyText := TStringList.Create;
 84   FBodyHtml := TStringList.Create;
 85 end;
 86 
 87 destructor TUILess.Destroy;
 88 begin
 89   if Assigned(FBodyText) then
 90     FBodyText.Free;
 91   if Assigned(FBodyHtml) then
 92     FBodyHtml.Free;
 93   inherited Destroy;
 94 end;
 95 
 96 /// CORE ---->>>>>>>>>
 97 function TUILess.Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
 98   : IHTMLELEMENTCollection;
 99 var
100   Cookie: Integer;
101   CP: IConnectionPoint;
102   OleObject: IOleObject;
103   OleControl: IOleControl;
104   CPC: IConnectionPointContainer;
105   All: IHTMLElement;
106   Msg: TMsg;
107   hr: HResult;
108 begin
109   _URL := URL;
110   IsSucceed := false;
111   try
112     CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER,
113       IID_IHTMLDocument2, Doc);
114     OleObject := Doc as IOleObject;
115     OleObject.SetClientSite(self);
116     OleControl := Doc as IOleControl;
117     OleControl.OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
118     CPC := Doc as IConnectionPointContainer;
119     CPC.FindConnectionPoint(IPropertyNotifySink, CP);
120     CP.Advise(self, Cookie);
121     hr := LoadUrlFromMoniker; // alternative: Hr:= LoadUrlFromFile;
122     if ((SUCCEEDED(hr)) or (hr = E_PENDING)) then
123       while (GetMessage(Msg, 0, 0, 0)) do
124       begin
125         if ((Msg.message = WM_USER_STARTWALKING) and (Msg.hwnd = 0)) then
126         begin
127           PostQuitMessage(0);
128           result := Doc.Get_all;
129           All := Doc.Get_body;
130           FDocTitle := string(Doc.nameProp);
131           FBodyText.Text := string(All.outerText);
132           FBodyHtml.Text := string(All.outerHTML);
133           IsSucceed := true;
134         end
135         else
136           DispatchMessage(Msg);
137         if IsStop then
138           Exit;
139       end;
140   except
141     Exit;
142   end;
143 end;
144 
145 function TUILess.Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
146   Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
147 var
148   I: Integer;
149 begin
150   if DISPID_AMBIENT_DLCONTROL = DispID then
151   begin
152     I := DLCTL_DOWNLOADONLY + DLCTL_NO_SCRIPTS + DLCTL_NO_JAVA +
153       DLCTL_NO_DLACTIVEXCTLS + DLCTL_NO_RUNACTIVEXCTLS;
154     PVariant(VarResult)^ := I;
155     result := S_OK;
156   end
157   else
158     result := DISP_E_MEMBERNOTFOUND;
159 end;
160 
161 function TUILess.OnChanged(DispID: TDispID): HResult;
162 var
163   dp: TDispParams;
164   vResult: OleVariant;
165 begin
166   if (DISPID_READYSTATE = DispID) then
167     if SUCCEEDED((Doc as IhtmlDocument2).Invoke(DISPID_READYSTATE, GUID_null,
168         LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, dp, @vResult, nil, nil))
169       then
170       if Integer(vResult) = READYSTATE_COMPLETE then
171         PostThreadMessage(GetCurrentThreadId(), WM_USER_STARTWALKING, 0, 0);
172 end;
173 
174 function TUILess.LoadUrlFromMoniker: HResult;
175 var
176   Moniker: IMoniker;
177   BindCtx: IBindCTX;
178   PM: IPersistMoniker;
179 begin
180   createURLMoniker(nil, _URL, Moniker);
181   CreateBindCtx(0, BindCtx);
182   PM := Doc as IPersistMoniker;
183   result := PM.Load(LongBool(0), Moniker, BindCtx, STGM_READ)
184 end;
185 
186 function TUILess.LoadUrlFromFile: HResult;
187 var
188   PF: IPersistfile;
189 begin
190   PF := Doc as IPersistfile;
191   result := PF.Load(_URL, 0);
192 end;
193 
194 // 获取图像链接
195 procedure TUILess.GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
196 var
197   Image: IHTMLImgElement;
198   Disp: IDispatch;
199   x: Integer;
200 begin
201   if IC <> nil then
202   begin
203     for x := 0 to IC.Length - 1 do
204     begin
205       application.ProcessMessages;
206       Disp := IC.item(x, 0);
207       if SUCCEEDED(Disp.QueryInterface(IHTMLImgElement, Image)) then
208         ImageList.add(string(Image.src));
209     end;
210   end;
211 end;
212 
213 // 获取链接
214 procedure TUILess.GetAnchorList(IC: IHTMLELEMENTCollection;
215   Anchorlist: TStrings);
216 var
217   anchor: IHTMLAnchorElement;
218   Disp: IDispatch;
219   x: Integer;
220 begin
221   if IC <> nil then
222   begin
223     for x := 0 to IC.Length - 1 do
224     begin
225       application.ProcessMessages;
226       Disp := IC.item(x, 0);
227       if (SUCCEEDED(Disp.QueryInterface(IHTMLAnchorElement, anchor)) and
228           (anchor.href <> '')) then
229         Anchorlist.add(string(anchor.href));
230     end;
231   end;
232 end;
233 
234 /// Don't Care ------>>>>>>>>>>>
235 function TUILess.OnRequestEdit(DispID: TDispID): HResult;
236 begin
237   result := E_NOTIMPL;
238 end;
239 
240 function TUILess.SaveObject: HResult;
241 begin
242   result := E_NOTIMPL;
243 end;
244 
245 function TUILess.GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
246   out mk: IMoniker): HResult;
247 begin
248   result := E_NOTIMPL;
249 end;
250 
251 function TUILess.GetContainer(out container: IOleContainer): HResult;
252 begin
253   result := E_NOTIMPL;
254 end;
255 
256 function TUILess.ShowObject: HResult;
257 begin
258   result := E_NOTIMPL;
259 end;
260 
261 function TUILess.OnShowWindow(fShow: BOOL): HResult;
262 begin
263   result := E_NOTIMPL;
264 end;
265 
266 function TUILess.RequestNewObjectLayout: HResult;
267 begin
268   result := E_NOTIMPL;
269 end;
270 
271 end.
View Code

相关文章: