@@ -20,16 +20,16 @@ final class Crawler
 
 	private \Nette\Http\Url $startingUrl;
 
-	/** @var string[] */
+	/** @var array<int, string> */
 	private array $urlList = [];
 
-	/** @var string[] */
+	/** @var array<int, string> */
 	private array $allUrls = [];
 
-	/** @var string[][] */
+	/** @var array<string, array<int, string>> */
 	private array $urlReferences = [];
 
-	/** @var mixed[][] */
+	/** @var array<int, array{url: string, message: string, trace: array<int, mixed>}> */
 	private array $errors = [];
 
 
@@ -161,9 +161,14 @@ public function setTextSeparator(ITextSeparator $textSeparator): void
 
 	private function processBasicConfig(string $url): void
 	{
-		$this->startingUrl = $startingUrl = new \Nette\Http\Url($url);
+		$startingUrl = new \Nette\Http\Url($url);
+		$this->startingUrl = $startingUrl;
 		$this->addUrl($url);
-		$this->addUrl(($startingUrl->getScheme() === 'https' ? 'http' : 'https') . '://' . $startingUrl->getAuthority());
+		$this->addUrl(sprintf(
+			'%s://%s',
+			$startingUrl->getScheme() === 'https' ? 'http' : 'https',
+			$startingUrl->getAuthority(),
+		));
 	}
 
 
@@ -181,7 +186,7 @@ private function addUrl(string $url): void
 		if ($canAdd === true) { // Is allowed?
 			$isAllowed = false;
 			foreach ($this->config->getAllowedUrls() as $allow) {
-				if (preg_match('/^' . $allow . '$/', $url)) {
+				if (preg_match('/^' . $allow . '$/', $url) === 1) {
 					$isAllowed = true;
 					break;
 				}
@@ -193,7 +198,7 @@ private function addUrl(string $url): void
 		if ($canAdd === true) { // Is forbidden?
 			$isForbidden = false;
 			foreach ($this->config->getForbiddenUrls() as $forbidden) {
-				if (preg_match('/^' . $forbidden . '$/', $url)) {
+				if (preg_match('/^' . $forbidden . '$/', $url) === 1) {
 					$isForbidden = true;
 					break;
 				}
@@ -236,16 +241,16 @@ private function loadUrl(string $url): HttpResponse
 		$header = substr($response, 0, $headerSize);
 		$contentType = '';
 
-		if (preg_match('/Content-Type:\s+(\S+)/', $response, $contentTypeParser)) {
+		if (preg_match('/Content-Type:\s+(\S+)/', $response, $contentTypeParser) === 1) {
 			$contentType = $contentTypeParser[1];
 		}
 		if ($contentType === 'application/xml' || strncmp($contentType, 'text/', 5) === 0) {
-			$html = Strings::normalize((string) substr($response, $headerSize));
+			$html = Strings::normalize(substr($response, $headerSize));
 			$size = strlen($html);
 
 			if (
-				strpos($html, '<?xml') !== false
-				&& preg_match_all('/<loc>(https?\:\/\/[^\s\<]+)\<\/loc>/', $html, $sitemapUrls)
+				str_contains($html, '<?xml')
+				&& preg_match_all('/<loc>(https?\:\/\/[^\s\<]+)\<\/loc>/', $html, $sitemapUrls) > 0
 			) {
 				foreach ($sitemapUrls[1] ?? [] as $sitemapUrl) {
 					if (Validators::isUrl($sitemapUrl)) {
@@ -255,7 +260,7 @@ private function loadUrl(string $url): HttpResponse
 			}
 		} else {
 			$html = '<!-- FILE ' . $url . ' -->';
-			if (preg_match('/Content-Length:\s+(\d+)/', $response, $contentLength)) {
+			if (preg_match('/Content-Length:\s+(\d+)/', $response, $contentLength) === 1) {
 				$size = (int) $contentLength[1];
 			} else {
 				$size = strlen($response) - $headerSize;
@@ -271,7 +276,7 @@ private function loadUrl(string $url): HttpResponse
 			$this->formatHeaders($header),
 			self::timer($url) * 1_000,
 			(int) ($httpCodeParser['httpCode'] ?? 500),
-			$size < 0 ? 0 : $size,
+			max($size, 0),
 		);
 	}
 
@@ -294,7 +299,7 @@ private function formatHeaders(string $header): array
 	{
 		$return = [];
 		foreach (explode("\n", Strings::normalize($header)) as $_header) {
-			if (preg_match('/^(?<name>[^:]+):\s*(?<value>.*)$/', $_header, $headerParser)) {
+			if (preg_match('/^(?<name>[^:]+):\s*(?<value>.*)$/', $_header, $headerParser) === 1) {
 				$return[$headerParser['name']] = $headerParser['value'];
 			}
 		}
@@ -309,10 +314,10 @@ private function formatHeaders(string $header): array
 	private function getLinksFromHTML(string $url, string $html): array
 	{
 		$return = [];
-		if (preg_match_all('/<a[^>]+>/', $html, $aLinks)) {
+		if (preg_match_all('/<a[^>]+>/', $html, $aLinks) > 0) {
 			foreach ($aLinks[0] as $aLink) {
-				if (preg_match('/href=[\'"](?<url>[^\'"]+)[\'"]/', $aLink, $link)
-					&& !preg_match('/^(?:mailto|tel|phone)\:/', $link['url'])
+				if (preg_match('/href=[\'"](?<url>[^\'"]+)[\'"]/', $aLink, $link) === 1
+					&& preg_match('/^(?:mailto|tel|phone)\:/', $link['url']) !== 1
 				) {
 					$formattedLink = RelativeUrlToAbsoluteUrl::process($url, $link['url']);
 					if ($formattedLink !== null && !in_array($formattedLink, $return, true)) {
@@ -351,9 +356,10 @@ private function processRobots(string $url): ?string
 		$response = $this->loadUrl($url);
 		if ($response->getHttpCode() === 200) {
 			$this->addUrl($url);
-			foreach (explode("\n", $return = Strings::normalize($response->getHtml())) as $line) {
+			$return = Strings::normalize($response->getHtml());
+			foreach (explode("\n", $return) as $line) {
 				$line = trim($line);
-				if (preg_match('/^[Ss]itemap:\s+(https?\:\/\/\S+)/', $line, $robots)) {
+				if (preg_match('/^[Ss]itemap:\s+(https?\:\/\/\S+)/', $line, $robots) === 1) {
 					$this->addUrl($robots[1]);
 				}
 			}
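
The recurring change in these hunks is replacing truthy checks on preg_match() with explicit === 1 / !== 1 comparisons (and > 0 for preg_match_all(), which returns the number of matches). A minimal sketch of why that matters, using an illustrative pattern and subject rather than code taken from the Crawler class: preg_match() returns 1 on a match, 0 on no match, and false when the pattern itself fails to compile, so a strict comparison keeps the error case from being silently treated as "no match".

```php
<?php

declare(strict_types=1);

// Illustrative only — pattern and subject are not taken from the Crawler class.
$pattern = '/^https?:\/\//';
$subject = 'https://example.com/';

// preg_match() returns int|false: 1 (match), 0 (no match), false (regex error).
$result = preg_match($pattern, $subject);

if ($result === 1) {
	echo "matched\n";
} elseif ($result === 0) {
	echo "no match\n";
} else {
	// $result === false: the pattern itself is broken (e.g. a syntax error)
	echo 'regex error: ' . preg_last_error_msg() . "\n";
}
```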