diff --git a/src/RobotsTxt.php b/src/RobotsTxt.php
index 6f872d8..15766f3 100644
--- a/src/RobotsTxt.php
+++ b/src/RobotsTxt.php
@@ -126,38 +126,69 @@ protected function getDisallowsPerUserAgent(string $content): array
 
         $disallowsPerUserAgent = [];
 
-        $currentUserAgent = null;
+        // References to the rule lists of every user agent in the group being read.
+        $currentUserAgents = [];
+
+        // True once the current group has seen an Allow/Disallow rule; the next
+        // User-agent line then starts a new group instead of joining this one.
+        $treatAllowDisallowLine = false;
 
         foreach ($lines as $line) {
-            if ($this->isCommentLine($line)) {
+            if ($this->isComment($line)) {
+                continue;
+            }
+
+            if ($this->isEmptyLine($line)) {
                 continue;
             }
 
             if ($this->isUserAgentLine($line)) {
+                if ($treatAllowDisallowLine) {
+                    $treatAllowDisallowLine = false;
+                    $currentUserAgents = [];
+                }
                 $disallowsPerUserAgent[$this->parseUserAgent($line)] = [];
 
-                $currentUserAgent = &$disallowsPerUserAgent[$this->parseUserAgent($line)];
+                $currentUserAgents[] = &$disallowsPerUserAgent[$this->parseUserAgent($line)];
 
                 continue;
             }
 
-            if ($currentUserAgent === null) {
+            if ($this->isAllowLine($line)) {
+                $treatAllowDisallowLine = true;
                 continue;
             }
 
+            // Skip directives that are not rules (e.g. Sitemap, Crawl-delay) so
+            // they are not mis-parsed as disallowed paths.
+            if (! $this->isDisallowLine($line)) {
+                continue;
+            }
+
+            $treatAllowDisallowLine = true;
+
             $disallowUrl = $this->parseDisallow($line);
 
-            $currentUserAgent[$disallowUrl] = $disallowUrl;
+            foreach ($currentUserAgents as &$currentUserAgent) {
+                $currentUserAgent[$disallowUrl] = $disallowUrl;
+            }
+            // Break the reference left behind by the by-reference foreach.
+            unset($currentUserAgent);
         }
 
         return $disallowsPerUserAgent;
     }
 
-    protected function isCommentLine(string $line): bool
+    protected function isComment(string $line): bool
     {
         return strpos(trim($line), '#') === 0;
     }
 
+    protected function isEmptyLine(string $line): bool
+    {
+        return trim($line) === '';
+    }
+
     protected function isUserAgentLine(string $line): bool
     {
         return strpos(trim(strtolower($line)), 'user-agent') === 0;
@@ -173,6 +204,22 @@ protected function parseDisallow(string $line): string
         return trim(substr_replace(strtolower(trim($line)), '', 0, 8), ': ');
     }
 
+    /**
+     * Whether the line starts with a "Disallow" directive (case-insensitive, spaces ignored).
+     */
+    protected function isDisallowLine(string $line): bool
+    {
+        return trim(substr(str_replace(' ', '', strtolower(trim($line))), 0, 8), ': ') === 'disallow';
+    }
+
+    /**
+     * Whether the line starts with an "Allow" directive (case-insensitive, spaces ignored).
+     */
+    protected function isAllowLine(string $line): bool
+    {
+        return trim(substr(str_replace(' ', '', strtolower(trim($line))), 0, 6), ': ') === 'allow';
+    }
+
     /**
      * @deprecated
      */
diff --git a/tests/RobotsTxtTest.php b/tests/RobotsTxtTest.php
index 51cfa71..c8063bb 100644
--- a/tests/RobotsTxtTest.php
+++ b/tests/RobotsTxtTest.php
@@ -128,4 +128,76 @@ public function the_disallows_user_agent_check_is_case_insensitive()
         $this->assertFalse($robots->allows('/no-agents', 'UserAgent007'));
         $this->assertFalse($robots->allows('/no-agents', strtolower('UserAgent007')));
     }
+
+    /** @test */
+    public function it_can_handle_multiple_user_agent_query_strings()
+    {
+        $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
+
+        $this->assertFalse($robots->allows('/en/admin?print=true', 'UserAgent010'));
+        $this->assertFalse($robots->allows('/en/admin?print=true', 'UserAgent011'));
+        $this->assertTrue($robots->allows('/en/admin?print=true', 'UserAgent012'));
+        $this->assertTrue($robots->allows('/en/admin?print=true', 'UserAgent013'));
+    }
+
+    /** @test */
+    public function it_can_handle_multiple_user_agent_root_path()
+    {
+        $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
+
+        $this->assertTrue($robots->allows('/', 'UserAgent010'));
+        $this->assertTrue($robots->allows('/', 'UserAgent011'));
+        $this->assertTrue($robots->allows('/', 'UserAgent012'));
+        $this->assertTrue($robots->allows('/', 'UserAgent013'));
+    }
+
+    /** @test */
+    public function it_can_handle_multiple_user_agent_first_in_list()
+    {
+        $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
+
+        $this->assertTrue($robots->allows('/fr/ad', 'UserAgent010'));
+        $this->assertFalse($robots->allows('/fr/admin', 'UserAgent010'));
+        $this->assertTrue($robots->allows('/fr/admin/', 'UserAgent010'));
+        $this->assertTrue($robots->allows('/fr/admin?', 'UserAgent010'));
+        $this->assertTrue($robots->allows('/fr/admin?test', 'UserAgent010'));
+    }
+
+    /** @test */
+    public function it_can_handle_multiple_user_agent_last_in_list()
+    {
+        $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
+
+        $this->assertTrue($robots->allows('/fr/ad', 'UserAgent011'));
+        $this->assertFalse($robots->allows('/fr/admin', 'UserAgent011'));
+        $this->assertTrue($robots->allows('/fr/admin/', 'UserAgent011'));
+        $this->assertTrue($robots->allows('/fr/admin?', 'UserAgent011'));
+        $this->assertTrue($robots->allows('/fr/admin?test', 'UserAgent011'));
+    }
+
+    /** @test */
+    public function it_can_handle_multiple_user_agent_first_in_list_with_empty_and_comment_lines()
+    {
+        $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
+
+        $this->assertTrue($robots->allows('/fr/ad', 'UserAgent012'));
+        $this->assertTrue($robots->allows('/fr/admin', 'UserAgent012'));
+        $this->assertTrue($robots->allows('/fr/admin/', 'UserAgent012'));
+        $this->assertTrue($robots->allows('/fr/admin?', 'UserAgent012'));
+        $this->assertTrue($robots->allows('/fr/admin?test', 'UserAgent012'));
+        $this->assertFalse($robots->allows('/es/admin-disallow/', 'UserAgent012'));
+    }
+
+    /** @test */
+    public function it_can_handle_multiple_user_agent_last_in_list_with_empty_and_comment_line()
+    {
+        $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
+
+        $this->assertTrue($robots->allows('/fr/ad', 'UserAgent013'));
+        $this->assertTrue($robots->allows('/fr/admin', 'UserAgent013'));
+        $this->assertTrue($robots->allows('/fr/admin/', 'UserAgent013'));
+        $this->assertTrue($robots->allows('/fr/admin?', 'UserAgent013'));
+        $this->assertTrue($robots->allows('/fr/admin?test', 'UserAgent013'));
+        $this->assertFalse($robots->allows('/es/admin-disallow/', 'UserAgent013'));
+    }
 }
diff --git a/tests/data/robots.txt b/tests/data/robots.txt
index 29813ee..bce017c 100644
--- a/tests/data/robots.txt
+++ b/tests/data/robots.txt
@@ -12,4 +12,25 @@ User-agent: google
 Disallow: /
 
 User-agent: UserAgent007
-Disallow: /no-agents
\ No newline at end of file
+Disallow: /no-agents
+
+User-agent: UserAgent010
+User-agent: UserAgent011
+Disallow: /*?print
+Disallow: /nl/admin/
+Disallow: /en/admin/*
+Disallow: /fr/admin$
+Disallow: /es/admin-disallow/
+
+User-agent: UserAgent012
+
+User-agent: UserAgent013
+
+Allow: /*?print
+Disallow: /nl/admin/
+
+Disallow: /en/admin/*
+Allow: /fr/admin$
+
+#comment
+Disallow: /es/admin-disallow/
diff --git a/tests/server/package.json b/tests/server/package.json
index 2284928..12ecb51 100644
--- a/tests/server/package.json
+++ b/tests/server/package.json
@@ -9,6 +9,6 @@
   "author": "",
   "license": "MIT",
   "dependencies": {
-    "express": "^4.13.3"
+    "express": "^4.17.1"
   }
 }