<?xml version="1.0" encoding="US-ASCII"?> <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ <!DOCTYPE rfc [
<!ENTITY RFC1945 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re <!ENTITY nbsp "&#160;">
ference.RFC.1945.xml"> <!ENTITY zwsp "&#8203;">
<!ENTITY RFC2046 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re <!ENTITY nbhy "&#8209;">
ference.RFC.2046.xml"> <!ENTITY wj "&#8288;">
<!ENTITY RFC2119 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.2119.xml">
<!ENTITY RFC3629 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.3629.xml">
<!ENTITY RFC3986 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.3986.xml">
<!ENTITY RFC5234 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.5234.xml">
<!ENTITY RFC8174 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.8174.xml">
<!ENTITY RFC8288 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.8288.xml">
<!ENTITY RFC9110 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.9110.xml">
<!ENTITY RFC9111 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/re
ference.RFC.9111.xml">
]> ]>
<rfc ipr="trust200902" category="std" docName="draft-koster-rep-12" > <rfc xmlns:xi="http://www.w3.org/2001/XInclude" ipr="trust200902" docName="draft
-koster-rep-12" number="9309" obsoletes="" updates="" submissionType="IETF" cate
<?xml-stylesheet type="text/xsl" href="rfc2629.xslt" ?> gory="std" consensus="true" xml:lang="en" tocInclude="true" tocDepth="4" symRefs
="true" sortRefs="true" version="3">
<?rfc toc="yes" ?>
<?rfc tocdepth="4" ?>
<?rfc symrefs="yes" ?>
<?rfc sortrefs="yes"?>
<?rfc compact="yes" ?>
<?rfc subcompact="no"?>
  <!-- xml2rfc v2v3 conversion 3.13.0 -->

  <front>
    <title abbrev="Robots Exclusion Protocol (REP)">Robots Exclusion Protocol</title>
    <seriesInfo name="RFC" value="9309"/>
    <author initials="M." surname="Koster" fullname="Martijn Koster">
      <address>
        <postal>
          <extaddr>Stalworthy Manor Farm</extaddr>
          <street>Suton Lane</street>
          <city>Wymondham, Norfolk</city>
          <code>NR18 9JG</code>
          <country>United Kingdom</country>
        </postal>
        <email>m.koster@greenhills.co.uk</email>
      </address>
    </author>
    <author initials="G." surname="Illyes" fullname="Gary Illyes">
      <organization>Google LLC</organization>
      <address>
        <postal>
          <street>Brandschenkestrasse 110</street>
          <city>Zürich</city>
          <code>8002</code>
          <country>Switzerland</country>
        </postal>
        <email>garyillyes@google.com</email>
      </address>
    </author>
    <author initials="H." surname="Zeller" fullname="Henner Zeller">
      <organization>Google LLC</organization>
      <address>
        <postal>
          <street>1600 Amphitheatre Pkwy</street>
          <city>Mountain View</city>
          <region>CA</region>
          <code>94043</code>
          <country>United States of America</country>
        </postal>
        <email>henner@google.com</email>
      </address>
    </author>
    <author initials="L." surname="Sassman" fullname="Lizzi Sassman">
      <organization>Google LLC</organization>
      <address>
        <postal>
          <street>Brandschenkestrasse 110</street>
          <city>Zürich</city>
          <code>8002</code>
          <country>Switzerland</country>
        </postal>
        <email>lizzi@google.com</email>
      </address>
    </author>
    <date year="2022" month="September"/>
    <keyword>robot</keyword>
    <keyword>crawler</keyword>
    <keyword>robots.txt</keyword>
    <abstract>
      <t>This document specifies and extends the "Robots Exclusion Protocol"
      method originally defined by Martijn Koster in 1994 for service owners
      to control how content served by their services may be accessed, if at
      all, by automatic clients known as crawlers. Specifically, it adds
      definition language for the protocol, instructions for handling
      errors, and instructions for caching.</t>
    </abstract>
  </front>
  <middle>
    <section anchor="introduction" numbered="true" toc="default">
      <name>Introduction</name>
      <t>This document applies to services that provide resources that clients
      can access through URIs as defined in <xref target="RFC3986" format="default"/>.
      For example, in the context of HTTP, a browser is a client that displays
      the content of a web page.</t>
      <t>Crawlers are automated clients. Search engines, for instance, have
      crawlers to recursively traverse links for indexing as defined in
      <xref target="RFC8288" format="default"/>.</t>
      <t>It may be inconvenient for service owners if crawlers visit the
      entirety of their URI space. This document specifies the rules originally
      defined by the "Robots Exclusion Protocol" <xref target="ROBOTSTXT" format="default"/>
      that crawlers are requested to honor when accessing URIs.</t>
      <t>These rules are not a form of access authorization.</t>
      <section anchor="requirements-language" numbered="true" toc="default">
        <name>Requirements Language</name>
        <t>The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>",
        "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>",
        "<bcp14>SHALL NOT</bcp14>", "<bcp14>SHOULD</bcp14>",
        "<bcp14>SHOULD NOT</bcp14>",
        "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>",
        "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document
        are to be interpreted as described in BCP&nbsp;14
        <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only
        when, they appear in all capitals, as shown here.</t>
      </section>
    </section>
<section anchor="specification" title="Specification"> <section anchor="specification" numbered="true" toc="default">
<section anchor="protocol-definition" title="Protocol Definition"> <name>Specification</name>
<section anchor="protocol-definition" numbered="true" toc="default">
<name>Protocol Definition</name>
<t> The protocol language consists of rule(s) and group(s) that the serv ice <t> The protocol language consists of rule(s) and group(s) that the serv ice
makes available in a file named &#39;robots.txt&#39; as described in makes available in a file named "robots.txt" as described in
<xref target="access-method" />: </t> <xref target="access-method" format="default"/>: </t>
<t> <dl spacing="normal">
<list style="symbols"> <dt> Rule:</dt><dd> A line with a key-value pair that defines how a
<t> Rule: A line with a key-value pair that defines how a
crawler may access URIs. See crawler may access URIs. See
<xref target="the-allow-and-disallow-lines" />. </t> <xref target="the-allow-and-disallow-lines" format="default"/>.
<t> Group: One or more user-agent lines that is followed by </dd>
<dt> Group:</dt><dd> One or more user-agent lines that are followed by
one or more rules. The group is terminated by a user-agent line one or more rules. The group is terminated by a user-agent line
or end of file. See <xref target="the-user-agent-line" />. or end of file. See <xref target="the-user-agent-line" format="d efault"/>.
The last group may have no rules, which means it implicitly The last group may have no rules, which means it implicitly
allows everything. </t> allows everything. </dd>
</list> </t> </dl>
</section> </section>
<section anchor="formal-syntax" title="Formal Syntax"> <section anchor="formal-syntax" numbered="true" toc="default">
<name>Formal Syntax</name>
<t> Below is an Augmented Backus-Naur Form (ABNF) description, as descri bed <t> Below is an Augmented Backus-Naur Form (ABNF) description, as descri bed
in <xref target="RFC5234"/>. </t> in <xref target="RFC5234" format="default"/>. </t>
<sourcecode name="" type="abnf"><![CDATA[
<figure><artwork> robotstxt = *(group / emptyline)
<![CDATA[ group = startgroupline ; We start with a user-agent
robotstxt = *(group / emptyline) ; line
group = startgroupline ; We start with a user-agent *(startgroupline / emptyline) ; ... and possibly more
*(startgroupline / emptyline) ; ... and possibly more ; user-agent lines
; user-agents *(rule / emptyline) ; followed by rules relevant
*(rule / emptyline) ; followed by rules relevant ; for the preceding
; for UAs ; user-agent lines
startgroupline = *WS "user-agent" *WS ":" *WS product-token EOL startgroupline = *WS "user-agent" *WS ":" *WS product-token EOL
rule = *WS ("allow" / "disallow") *WS ":" rule = *WS ("allow" / "disallow") *WS ":"
*WS (path-pattern / empty-pattern) EOL *WS (path-pattern / empty-pattern) EOL
; parser implementors: define additional lines you need (for ; parser implementors: define additional lines you need (for
; example, sitemaps). ; example, Sitemaps).
product-token = identifier / "*" product-token = identifier / "*"
path-pattern = "/" *UTF8-char-noctl ; valid URI path pattern path-pattern = "/" *UTF8-char-noctl ; valid URI path pattern
empty-pattern = *WS empty-pattern = *WS
identifier = 1*(%x2D / %x41-5A / %x5F / %x61-7A) identifier = 1*(%x2D / %x41-5A / %x5F / %x61-7A)
comment = "#" *(UTF8-char-noctl / WS / "#") comment = "#" *(UTF8-char-noctl / WS / "#")
emptyline = EOL emptyline = EOL
EOL = *WS [comment] NL ; end-of-line may have EOL = *WS [comment] NL ; end-of-line may have
; optional trailing comment ; optional trailing comment
NL = %x0D / %x0A / %x0D.0A NL = %x0D / %x0A / %x0D.0A
WS = %x20 / %x09 WS = %x20 / %x09
; UTF8 derived from RFC3629, but excluding control characters ; UTF8 derived from RFC 3629, but excluding control characters
UTF8-char-noctl = UTF8-1-noctl / UTF8-2 / UTF8-3 / UTF8-4 UTF8-char-noctl = UTF8-1-noctl / UTF8-2 / UTF8-3 / UTF8-4
UTF8-1-noctl = %x21 / %x22 / %x24-7F ; excluding control, space, '#' UTF8-1-noctl = %x21 / %x22 / %x24-7F ; excluding control, space, "#"
UTF8-2 = %xC2-DF UTF8-tail UTF8-2 = %xC2-DF UTF8-tail
UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2UTF8-tail / UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2UTF8-tail /
%xED %x80-9F UTF8-tail / %xEE-EF 2UTF8-tail %xED %x80-9F UTF8-tail / %xEE-EF 2UTF8-tail
UTF8-4 = %xF0 %x90-BF 2UTF8-tail / %xF1-F3 3UTF8-tail / UTF8-4 = %xF0 %x90-BF 2UTF8-tail / %xF1-F3 3UTF8-tail /
%xF4 %x80-8F 2UTF8-tail %xF4 %x80-8F 2UTF8-tail
UTF8-tail = %x80-BF UTF8-tail = %x80-BF
]]> ]]></sourcecode>
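        <t>The following is a minimal, non-normative sketch (in Python, for
        illustration only) of a line-oriented parser that follows the grammar
        above. The names parse_robotstxt and Group are illustrative and not
        part of the protocol; a production parser would also need to apply the
        encoding and size considerations of <xref target="limits" format="default"/>.</t>
        <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: parse robots.txt lines into groups of
# (user-agent product tokens, rules).  Names are illustrative only.
from dataclasses import dataclass, field


@dataclass
class Group:
    user_agents: list = field(default_factory=list)  # lowercased product tokens
    rules: list = field(default_factory=list)        # (kind, path-pattern) tuples


def parse_robotstxt(text: str) -> list:
    groups, current, seen_rule = [], None, False
    for raw_line in text.splitlines():
        line = raw_line.split("#", 1)[0].strip()   # drop trailing comment
        if not line or ":" not in line:
            continue                               # empty or unparseable line
        key, _, value = line.partition(":")
        key, value = key.strip().lower(), value.strip()
        if key == "user-agent":
            # A user-agent line after rules starts a new group.
            if current is None or seen_rule:
                current = Group()
                groups.append(current)
                seen_rule = False
            current.user_agents.append(value.lower())
        elif key in ("allow", "disallow"):
            if current is not None:                # ignore rules before any group
                current.rules.append((key, value))
                seen_rule = True
        # Other records (for example, "sitemap") are ignored here and do
        # not terminate the group (see the "Other Records" section).
    return groups
]]></sourcecode>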
        <section anchor="the-user-agent-line" numbered="true" toc="default">
          <name>The User-Agent Line</name>
          <t>Crawlers set their own name, which is called a product token, to find
          relevant groups. The product token <bcp14>MUST</bcp14> contain only
          uppercase and lowercase letters ("a-z" and "A-Z"),
          underscores ("_"), and hyphens ("-").
          The product token <bcp14>SHOULD</bcp14>
          be a substring of the identification string that the crawler sends to
          the service. For example, in the case of HTTP
          <xref target="RFC9110" format="default"/>, the product token
          <bcp14>SHOULD</bcp14> be a substring in the User-Agent header.
          The identification string <bcp14>SHOULD</bcp14> describe the purpose of
          the crawler. Here's an example of a User-Agent HTTP request header
          with a link pointing to a page describing the purpose of the
          ExampleBot crawler, which appears as a substring in the User-Agent HTTP
          header and as a product token in the robots.txt user-agent line:</t>
          <figure anchor="fig-1">
            <name>Example of a User-Agent HTTP header and robots.txt user-agent
            line for the ExampleBot product token</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+==========================================+========================+
| User-Agent HTTP header                   | robots.txt user-agent  |
|                                          | line                   |
+==========================================+========================+
| User-Agent: Mozilla/5.0 (compatible;     | user-agent: ExampleBot |
| ExampleBot/0.1;                          |                        |
| https://www.example.com/bot.html)        |                        |
+------------------------------------------+------------------------+
]]></artwork>
          </figure>
          <t>Note that the product token (ExampleBot) is a substring of
          the User-Agent HTTP header.</t>
          <t>Crawlers <bcp14>MUST</bcp14> use case-insensitive matching
          to find the group that matches the product token and then
          obey the rules of the group. If there is more than one
          group matching the user-agent, the matching groups' rules
          <bcp14>MUST</bcp14> be combined into one group and parsed
          according to
          <xref target="the-allow-and-disallow-lines" format="default"/>.</t>
<texttable title="Example of how to merge two robots.txt
groups that match the same product token">
<ttcol align="left">Two groups that match the same product token exa
ctly</ttcol>
<ttcol align="left">Merged group</ttcol>
<c>user-agent: ExampleBot<br />
disallow: /foo<br />
disallow: /bar<br />
<br />
user-agent: ExampleBot<br />
disallow: /baz
</c>
<c>user-agent: ExampleBot<br />
disallow: /foo<br />
disallow: /bar<br />
disallow: /baz</c>
</texttable>
<figure anchor="fig-2">
<name>Example of how to merge two robots.txt
groups that match the same product token</name>
<artwork name="" type="" align="center" alt=""><![CDATA[
+========================================+========================+
| Two groups that match the same product | Merged group |
| token exactly | |
+========================================+========================+
| user-agent: ExampleBot | user-agent: ExampleBot |
| disallow: /foo | disallow: /foo |
| disallow: /bar | disallow: /bar |
| | disallow: /baz |
| user-agent: ExampleBot | |
| disallow: /baz | |
+----------------------------------------+------------------------+
]]></artwork>
</figure>
<t> If no matching group exists, crawlers <bcp14>MUST</bcp14> obey the group <t> If no matching group exists, crawlers <bcp14>MUST</bcp14> obey the group
with a user-agent line with the "*" value, if present. </t> with a user-agent line with the "*" value, if present. </t>
<figure anchor="fig-3">
<texttable title="Example of no matching groups other than the '*' <name>Example of no matching groups other than the "*"
for the ExampleBot product token"> for the ExampleBot product token</name>
<ttcol align="left">Two groups that don't explicitly match ExampleBo <artwork name="" type="" align="center" alt=""><![CDATA[
t</ttcol> +==================================+======================+
<ttcol align="left">Applicable group for ExampleBot</ttcol> | Two groups that don't explicitly | Applicable group for |
<c>user-agent: *<br /> | match ExampleBot | ExampleBot |
disallow: /foo<br /> +==================================+======================+
disallow: /bar<br /> | user-agent: * | user-agent: * |
<br /> | disallow: /foo | disallow: /foo |
user-agent: BazBot<br /> | disallow: /bar | disallow: /bar |
disallow: /baz | | |
</c> | user-agent: BazBot | |
<c>user-agent: *<br /> | disallow: /baz | |
disallow: /foo<br /> +----------------------------------+----------------------+
disallow: /bar</c> ]]></artwork>
</texttable> </figure>
<t> If no group matches the product token and there is no group with a user-agent <t> If no group matches the product token and there is no group with a user-agent
line with the "*" value, or no groups are present at all, no line with the "*" value, or no groups are present at all, no
rules apply. </t> rules apply. </t>
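          <t>As a non-normative illustration, the following Python sketch
          (reusing the hypothetical Group objects from the parser sketch in
          <xref target="formal-syntax" format="default"/>) selects the rules
          that apply to a given product token, merging all matching groups and
          falling back to the "*" group when no group matches:</t>
          <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: select the rules that apply to a product token.
# Assumes the Group objects produced by the parser sketch above.
def applicable_rules(groups, product_token: str) -> list:
    token = product_token.lower()            # matching is case-insensitive
    matched = [g for g in groups if token in g.user_agents]
    if not matched:                          # fall back to the "*" group
        matched = [g for g in groups if "*" in g.user_agents]
    # Rules of all matching groups are combined into one group.
    return [rule for g in matched for rule in g.rules]
]]></sourcecode>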
        </section>
        <section anchor="the-allow-and-disallow-lines" numbered="true" toc="default">
          <name>The &quot;Allow&quot; and &quot;Disallow&quot; Lines</name>
          <t>These lines indicate whether accessing a URI that matches the
          corresponding path is allowed or disallowed.</t>
          <t>To evaluate if access to a URI is allowed, a crawler <bcp14>MUST</bcp14>
          match the paths in "allow" and "disallow" rules against the URI.
          The matching <bcp14>SHOULD</bcp14> be case sensitive. The matching
          <bcp14>MUST</bcp14> start with the first octet of the path. The most
          specific match found <bcp14>MUST</bcp14> be used. The most specific
          match is the match that has the most octets. Duplicate rules in a
          group <bcp14>MAY</bcp14> be deduplicated. If an "allow" rule and a "disallow"
          rule are equivalent, then the "allow" rule <bcp14>SHOULD</bcp14> be
          used. If no
          match is found amongst the rules in a group for a matching user-agent
          or there are no rules in the group, the URI is allowed. The
          /robots.txt URI is implicitly allowed.</t>
          <t>Octets in the URI and robots.txt paths outside the range of the
          ASCII coded character set, and those in the reserved range defined
          by <xref target="RFC3986" format="default"/>, <bcp14>MUST</bcp14>
          be percent-encoded as
          defined by <xref target="RFC3986" format="default"/> prior to comparison.</t>
          <t>If a percent-encoded ASCII octet is encountered in the URI, it
          <bcp14>MUST</bcp14> be unencoded prior to comparison, unless it is a
          reserved character in the URI as defined by <xref target="RFC3986" format="default"/>
          or the character is outside the unreserved character range. The match
          evaluates positively if and only if the end of the path from the rule
          is reached before a difference in octets is encountered.</t>
          <t>For example:</t>
          <figure anchor="fig-4">
            <name>Examples of matching percent-encoded URI components</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+==================+=======================+=======================+
| Path             | Encoded Path          | Path to Match         |
+==================+=======================+=======================+
| /foo/bar?baz=quz | /foo/bar?baz=quz      | /foo/bar?baz=quz      |
+------------------+-----------------------+-----------------------+
| /foo/bar?baz=    | /foo/bar?baz=         | /foo/bar?baz=         |
| https://foo.bar  | https%3A%2F%2Ffoo.bar | https%3A%2F%2Ffoo.bar |
+------------------+-----------------------+-----------------------+
| /foo/bar/        | /foo/bar/%E3%83%84    | /foo/bar/%E3%83%84    |
| U+E38384         |                       |                       |
+------------------+-----------------------+-----------------------+
| /foo/            | /foo/bar/%62%61%7A    | /foo/bar/baz          |
| bar/%62%61%7A    |                       |                       |
+------------------+-----------------------+-----------------------+
]]></artwork>
          </figure>
          <t>The crawler <bcp14>SHOULD</bcp14> ignore "disallow" and
          "allow" rules that are not in any group (for example, any
          rule that precedes the first user-agent line).</t>
          <t>Implementors <bcp14>MAY</bcp14> bridge encoding mismatches if they
          detect that the robots.txt file is not UTF-8 encoded.</t>
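          <t>The following non-normative Python sketch illustrates the
          precedence rules above: the longest match wins, an "allow" rule wins
          a tie, and a URI with no matching rule is allowed. For brevity, it
          compares plain path prefixes and measures length in characters rather
          than octets; handling of the "*" and "$" special characters is
          sketched in <xref target="special-characters" format="default"/>.
          The function and parameter names are illustrative only.</t>
          <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: decide whether a URI path may be crawled.
# "rules" is a list of (kind, pattern) tuples for the matching group,
# for example the output of the applicable_rules() sketch above.
def is_allowed(rules, path: str) -> bool:
    if path == "/robots.txt":
        return True                        # implicitly allowed
    best_kind, best_length = "allow", -1
    for kind, pattern in rules:
        if pattern and path.startswith(pattern):
            length = len(pattern)
            # Longest (most specific) match wins; "allow" wins a tie.
            if length > best_length or (length == best_length and kind == "allow"):
                best_kind, best_length = kind, length
    return best_kind == "allow"
]]></sourcecode>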
        </section>
        <section anchor="special-characters" numbered="true" toc="default">
          <name>Special Characters</name>
          <t>Crawlers <bcp14>MUST</bcp14> support the following special characters:</t>
          <figure anchor="fig-5">
            <name>List of special characters in robots.txt files</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+===========+===================+==============================+
| Character | Description       | Example                      |
+===========+===================+==============================+
| #         | Designates a line | allow: / # comment in line   |
|           | comment.          |                              |
|           |                   | # comment on its own line    |
+-----------+-------------------+------------------------------+
| $         | Designates the    | allow: /this/path/exactly$   |
|           | end of the match  |                              |
|           | pattern.          |                              |
+-----------+-------------------+------------------------------+
| *         | Designates 0 or   | allow: /this/*/exactly       |
|           | more instances of |                              |
|           | any character.    |                              |
+-----------+-------------------+------------------------------+
]]></artwork>
          </figure>
          <t>If crawlers match special characters verbatim in the URI, crawlers
          <bcp14>SHOULD</bcp14> use "%" encoding. For example:</t>
          <figure anchor="fig-6">
            <name>Example of percent-encoding</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+============================+====================================+
| Percent-encoded Pattern    | URI                                |
+============================+====================================+
| /path/file-with-a-%2A.html | https://www.example.com/path/      |
|                            | file-with-a-*.html                 |
+----------------------------+------------------------------------+
| /path/foo-%24              | https://www.example.com/path/foo-$ |
+----------------------------+------------------------------------+
]]></artwork>
          </figure>
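          <t>One possible, non-normative way to implement the "*" and "$"
          special characters is to translate a path pattern into a regular
          expression, as in the following Python sketch. The use of regular
          expressions is an implementation choice, not a requirement of the
          protocol, and the helper name is illustrative only:</t>
          <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: match a robots.txt path pattern containing the
# "*" and "$" special characters against a URI path.
import re


def pattern_matches(pattern: str, path: str) -> bool:
    # "$" anchors the end of the pattern when it is the last character.
    anchored = pattern.endswith("$")
    if anchored:
        pattern = pattern[:-1]
    # Escape everything, then turn the escaped "*" back into ".*".
    regex = re.escape(pattern).replace(r"\*", ".*")
    if anchored:
        regex += "$"
    # re.match() anchors at the start, matching from the first octet.
    return re.match(regex, path) is not None


# For example, pattern_matches("/this/*/exactly$", "/this/path/exactly")
# is True, while pattern_matches("*.gif$", "/a.gif.html") is False.
]]></sourcecode>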
        </section>
        <section anchor="other-records" numbered="true" toc="default">
          <name>Other Records</name>
          <t>Crawlers <bcp14>MAY</bcp14> interpret other records that are not
          part of the robots.txt protocol -- for example, "Sitemaps"
          <xref target="SITEMAPS" format="default"/>. Crawlers <bcp14>MAY</bcp14> be lenient when
          interpreting other records. For example, crawlers may accept
          common misspellings of the record.</t>
          <t>Parsing of other records
          <bcp14>MUST NOT</bcp14> interfere with the parsing of explicitly
          defined records in <xref target="specification" format="default"/>.
          For example, a "Sitemaps" record <bcp14>MUST NOT</bcp14> terminate a
          group.</t>
        </section>
      </section>
<section anchor="access-method" title="Access Method"> <section anchor="access-method" numbered="true" toc="default">
<t> The rules <bcp14>MUST</bcp14> be accessible in a file named <name>Access Method</name>
&quot;/robots.txt&quot; (all lower case) in the top level path of <t> The rules <bcp14>MUST</bcp14> be accessible in a file named
"/robots.txt" (all lowercase) in the top-level path of
the service. The file <bcp14>MUST</bcp14> be UTF-8 encoded (as the service. The file <bcp14>MUST</bcp14> be UTF-8 encoded (as
defined in <xref target="RFC3629"/>) and Internet Media Type defined in <xref target="RFC3629" format="default"/>) and Internet Med
&quot;text/plain&quot; ia Type
(as defined in <xref target="RFC2046"/>). </t> "text/plain"
<t> As per <xref target="RFC3986"/>, the URI of the robots.txt is: </t> (as defined in <xref target="RFC2046" format="default"/>). </t>
<t> &quot;scheme:[//authority]/robots.txt&quot; </t> <t> As per <xref target="RFC3986" format="default"/>, the URI of the rob
<t> For example, in the context of HTTP or FTP, the URI is: </t> ots.txt file is: </t>
<t> "scheme:[//authority]/robots.txt" </t>
<figure> <t> For example, in the context of HTTP or FTP, the URI is: </t>
<artwork><![CDATA[ <artwork name="" type="" align="left" alt=""><![CDATA[
https://www.example.com/robots.txt https://www.example.com/robots.txt
ftp://ftp.example.com/robots.txt ftp://ftp.example.com/robots.txt
]]></artwork> ]]></artwork>
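        <t>As a non-normative illustration, the robots.txt URI governing a
        resource can be derived by keeping only the scheme and authority of the
        resource URI, as in the following Python sketch (the function name is
        illustrative):</t>
        <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: derive the robots.txt URI that governs a
# resource URI by keeping only the scheme and authority components.
from urllib.parse import urlsplit, urlunsplit


def robotstxt_uri(resource_uri: str) -> str:
    parts = urlsplit(resource_uri)
    return urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))


# robotstxt_uri("https://www.example.com/a/b?c=d")
#   -> "https://www.example.com/robots.txt"
]]></sourcecode>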
        <section anchor="access-results" numbered="true" toc="default">
          <name>Access Results</name>
          <section anchor="successful-access" numbered="true" toc="default">
            <name>Successful Access</name>
            <t>If the crawler successfully downloads the robots.txt file, the
            crawler <bcp14>MUST</bcp14> follow the parseable rules.</t>
          </section>
          <section anchor="redirects" numbered="true" toc="default">
            <name>Redirects</name>
            <t>It's possible that a server responds to a robots.txt fetch
            request with a redirect, such as HTTP 301 or HTTP 302 in the
            case of HTTP. The crawlers <bcp14>SHOULD</bcp14> follow at
            least five consecutive redirects, even across authorities
            (for example, hosts in the case of HTTP).</t>
            <t>If a robots.txt file is reached within five consecutive
            redirects, the robots.txt file <bcp14>MUST</bcp14> be fetched,
            parsed, and its rules followed in the context of the initial
            authority.</t>
            <t>If there are more than five consecutive redirects, crawlers
            <bcp14>MAY</bcp14> assume that the robots.txt file is
            unavailable.</t>
          </section>
<section anchor="unavailable-status" title="Unavailable Status"> <section anchor="unavailable-status" numbered="true" toc="default">
<t> Unavailable means the crawler tries to fetch the robots.txt, <name>&quot;Unavailable&quot; Status</name>
and the server responds with unavailable status codes. For <t> "Unavailable" means the crawler tries to fetch the robots.txt fi
example, in the context of HTTP, unavailable status codes are le
and the server responds with status codes indicating that the reso
urce in question is unavailable. For
example, in the context of HTTP, such status codes are
in the 400-499 range. </t> in the 400-499 range. </t>
<t> If a server status code indicates that the robots.txt file is
<t> If a server status code indicates that the robots.txt file is unavailable to the crawler, then the crawler <bcp14>MAY</bcp14> ac
unavailable to the crawler, then the crawler MAY access any cess any
resources on the server. </t> resources on the server. </t>
</section> </section>
<section anchor="unreachable-status" title="Unreachable Status"> <section anchor="unreachable-status" numbered="true" toc="default">
<t> If the robots.txt is unreachable due to server or network <name>&quot;Unreachable&quot; Status</name>
errors, this means the robots.txt is undefined and the crawler <t> If the robots.txt file is unreachable due to server or network
errors, this means the robots.txt file is undefined and the crawle
r
<bcp14>MUST</bcp14> assume complete disallow. For example, in <bcp14>MUST</bcp14> assume complete disallow. For example, in
the context of HTTP, an unreachable robots.txt has a response the context of HTTP, server errors are identified by status codes
code in the 500-599 range. </t> in the 500-599 range. </t>
<t> If the robots.txt is undefined for a reasonably long period of <t> If the robots.txt file is undefined for a reasonably long period
time (for example, 30 days), crawlers <bcp14>MAY</bcp14> assume of
the robots.txt is unavailable as defined in time (for example, 30 days), crawlers <bcp14>MAY</bcp14> assume th
<xref target="unavailable-status"/> or continue to use a cached at
the robots.txt file is unavailable as defined in
<xref target="unavailable-status" format="default"/> or continue t
o use a cached
copy. </t> copy. </t>
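            <t>The following non-normative Python sketch summarizes how the
            access results described in this and the preceding subsections
            could be mapped to crawler behavior in the context of HTTP. The
            outcome labels and helper names are assumptions of this sketch, and
            the size cap anticipates <xref target="limits" format="default"/>:</t>
            <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: map the result of a robots.txt fetch to the
# crawler behavior described in the "Access Results" subsections.
# The outcome labels are illustrative only.
MAX_REDIRECTS = 5              # crawlers SHOULD follow at least five redirects
MAX_PARSE_BYTES = 500 * 1024   # parsing limit; see the "Limits" section


def robots_fetch_outcome(status, redirects_followed, body=b""):
    if redirects_followed > MAX_REDIRECTS:
        return "ALLOW_ALL", ""         # MAY be treated as unavailable
    if 200 <= status < 300:
        # Hand the (size-capped) text to a parser such as the
        # parse_robotstxt sketch in the "Formal Syntax" section.
        text = body[:MAX_PARSE_BYTES].decode("utf-8", errors="replace")
        return "USE_RULES", text
    if 400 <= status < 500:
        return "ALLOW_ALL", ""         # "Unavailable": MAY access any resource
    if 500 <= status < 600:
        return "DISALLOW_ALL", ""      # "Unreachable": MUST assume complete disallow
    # Other cases (for example, network errors without a status code)
    # are treated conservatively in this sketch.
    return "DISALLOW_ALL", ""
]]></sourcecode>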
          </section>
          <section anchor="parsing-errors" numbered="true" toc="default">
            <name>Parsing Errors</name>
            <t>Crawlers <bcp14>MUST</bcp14> try to parse each line of the
            robots.txt file. Crawlers <bcp14>MUST</bcp14> use the parseable
            rules.</t>
          </section>
        </section>
      </section>
      <section anchor="caching" numbered="true" toc="default">
        <name>Caching</name>
        <t>Crawlers <bcp14>MAY</bcp14> cache the fetched robots.txt file's
        contents. Crawlers <bcp14>MAY</bcp14> use standard cache control as
        defined in <xref target="RFC9111" format="default"/>. Crawlers
        <bcp14>SHOULD NOT</bcp14> use the cached version for more than 24
        hours, unless the robots.txt file is unreachable.</t>
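        <t>A non-normative Python sketch of one way to derive a cache lifetime
        that honors both a Cache-Control max-age directive
        (<xref target="RFC9111" format="default"/>) and the 24-hour ceiling
        recommended above; the header parsing is deliberately simplified and
        the names are illustrative:</t>
        <sourcecode name="" type="python"><![CDATA[
# Non-normative sketch: cap the cache lifetime of a fetched robots.txt
# file at 24 hours, honoring a shorter Cache-Control max-age if present.
import re
from typing import Optional

MAX_CACHE_SECONDS = 24 * 60 * 60


def cache_lifetime(cache_control_header: Optional[str]) -> int:
    if cache_control_header:
        match = re.search(r"max-age=(\d+)", cache_control_header)
        if match:
            return min(int(match.group(1)), MAX_CACHE_SECONDS)
    return MAX_CACHE_SECONDS


# cache_lifetime("max-age=3600")   -> 3600
# cache_lifetime("max-age=604800") -> 86400 (capped at 24 hours)
# cache_lifetime(None)             -> 86400
]]></sourcecode>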
      </section>
      <section anchor="limits" numbered="true" toc="default">
        <name>Limits</name>
        <t>Crawlers <bcp14>SHOULD</bcp14> impose a parsing limit to protect their systems;
        see <xref target="security" format="default"/>. The parsing limit <bcp14>MUST</bcp14> be at least
        500 kibibytes <xref target="KiB" format="default"/>.</t>
      </section>
    </section>
<section anchor="security" title="Security Considerations"> <section anchor="security" numbered="true" toc="default">
<t> The Robots Exclusion Protocol is not a substitute for more valid <name>Security Considerations</name>
<t> The Robots Exclusion Protocol is not a substitute for valid
content security measures. Listing paths in the robots.txt file content security measures. Listing paths in the robots.txt file
exposes them publicly and thus makes the paths discoverable. To exposes them publicly and thus makes the paths discoverable. To
control access to the URI paths in a robots.txt file, users of control access to the URI paths in a robots.txt file, users of
the protocol should employ a valid security measure relevant to the protocol should employ a valid security measure relevant to
the application layer on which the robots.txt file is served. the application layer on which the robots.txt file is served --
For example, in case of HTTP, HTTP Authentication defined in for example, in the case of HTTP, HTTP Authentication as defined in
<xref target="RFC9110"/>. </t> <xref target="RFC9110" format="default"/>. </t>
<t> To protect against attacks against their system, implementors <t> To protect against attacks against their system, implementors
of robots.txt parsing and matching logic should take the of robots.txt parsing and matching logic should take the
following considerations into account: </t> following considerations into account: </t>
<t> <dl spacing="normal">
<list style="symbols"> <dt> Memory management:</dt><dd> <xref target="limits" format="default"/
<t> Memory management: <xref target="limits" /> defines the lower > defines the lower
limit of bytes that must be processed, which inherently also limit of bytes that must be processed, which inherently also
protects the parser from out of memory scenarios. </t> protects the parser from out-of-memory scenarios. </dd>
<t> Invalid characters: <xref target="formal-syntax" /> defines <dt> Invalid characters:</dt><dd> <xref target="formal-syntax" format="d
efault"/> defines
a set of characters that parsers and matchers can expect in a set of characters that parsers and matchers can expect in
robots.txt files. Out of bound characters should be rejected robots.txt files. Out-of-bound characters should be rejected
as invalid, which limits the available attack vectors that as invalid, which limits the available attack vectors that
attempt to compromise the system. </t> attempt to compromise the system. </dd>
<t> Untrusted content: Implementors should treat the content of <dt> Untrusted content:</dt><dd> Implementors should treat the content o
f
a robots.txt file as untrusted content, as defined by the a robots.txt file as untrusted content, as defined by the
specification of the application layer used. For example, specification of the application layer used. For example,
in the context of HTTP, implementors should follow the in the context of HTTP, implementors should follow the
security considerations section of Security Considerations section of
<xref target="RFC9110"/>. </t> <xref target="RFC9110" format="default"/>. </dd>
</list> </dl>
</t>
</section> </section>
<section anchor="IANA" title="IANA Considerations"> <section anchor="IANA" numbered="true" toc="default">
<t> This document has no actions for IANA. </t> <name>IANA Considerations</name>
<t> This document has no IANA actions. </t>
</section> </section>
<section anchor="examples" title="Examples"> <section anchor="examples" numbered="true" toc="default">
<section anchor="simple-example" title="Simple Example"> <name>Examples</name>
<section anchor="simple-example" numbered="true" toc="default">
<name>Simple Example</name>
<t> The following example shows: </t> <t> The following example shows: </t>
<t> <dl spacing="normal">
<list style="symbols"> <dt> *:</dt><dd> A group that's relevant to all user agents that
<t> *: A group that's relevant to all user-agents that
don't have an explicitly defined matching group. It allows don't have an explicitly defined matching group. It allows
access to the URLs with the /publications/ path prefix, and access to the URLs with the /publications/ path prefix, and it
restricts access to the URLs with the /example/ path prefix restricts access to the URLs with the /example/ path prefix
and to all URLs with .gif suffix. The * character designates and to all URLs with a .gif suffix. The "*" character designates
any character, including the otherwise required forward any character, including the otherwise-required forward
slash; see <xref target="formal-syntax" />. </t> slash; see <xref target="formal-syntax" format="default"/>. </dd
<t> foobot: A regular case. A single user-agent followed >
<dt> foobot:</dt><dd> A regular case. A single user agent followed
by rules. The crawler only has access to two URL path by rules. The crawler only has access to two URL path
prefixes on the site, /example/page.html and prefixes on the site -- /example/page.html and
/example/allowed.gif. The rules of the group are missing /example/allowed.gif. The rules of the group are missing
the optional whitespace character, which is acceptable as the optional space character, which is acceptable as
defined in <xref target="formal-syntax" />. </t> defined in <xref target="formal-syntax" format="default"/>. </dd
<t> barbot and bazbot: A group that&#39;s relevant for more >
than one user-agent. The crawlers are not allowed to access <dt> barbot and bazbot:</dt><dd> A group that's relevant for more
the URLs with the /example/page.html path prefix, but than one user agent. The crawlers are not allowed to access
the URLs with the /example/page.html path prefix but
otherwise have unrestricted access to the rest of the URLs otherwise have unrestricted access to the rest of the URLs
on the site. </t> on the site. </dd>
<t> quxbot: An empty group at end of the file. The crawler has <dt> quxbot:</dt><dd> An empty group at the end of the file. The crawl
unrestricted access to the URLs on the site. </t> er has
</list> unrestricted access to the URLs on the site. </dd>
</t> </dl>
<figure> <artwork name="" type="" align="left" alt=""><![CDATA[
<artwork><![CDATA[ User-Agent: *
User-agent: *
Disallow: *.gif$ Disallow: *.gif$
Disallow: /example/ Disallow: /example/
Allow: /publications/ Allow: /publications/
User-Agent: foobot User-Agent: foobot
Disallow:/ Disallow:/
Allow:/example/page.html Allow:/example/page.html
Allow:/example/allowed.gif Allow:/example/allowed.gif
User-Agent: barbot User-Agent: barbot
User-Agent: bazbot User-Agent: bazbot
Disallow: /example/page.html Disallow: /example/page.html
User-Agent: quxbot User-Agent: quxbot
EOF EOF
]]></artwork> ]]></artwork>
</figure>
</section> </section>
<section anchor="longest-match" title="Longest Match"> <section anchor="longest-match" numbered="true" toc="default">
<name>Longest Match</name>
<t> The following example shows that in the case of two rules, the <t> The following example shows that in the case of two rules, the
longest one is used for matching. In the following case, longest one is used for matching. In the following case,
/example/page/disallowed.gif <bcp14>MUST</bcp14> be used for /example/page/disallowed.gif <bcp14>MUST</bcp14> be used for
the URI example.com/example/page/disallow.gif. </t> the URI example.com/example/page/disallow.gif. </t>
<figure> <artwork name="" type="" align="left" alt=""><![CDATA[
<artwork><![CDATA[
User-Agent: foobot User-Agent: foobot
Allow: /example/page/ Allow: /example/page/
Disallow: /example/page/disallowed.gif Disallow: /example/page/disallowed.gif
]]></artwork> ]]></artwork>
</figure>
</section> </section>
</section> </section>
  </middle>
  <back>
    <references>
      <name>References</name>
      <references>
        <name>Normative References</name>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2046.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3629.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3986.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5234.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8288.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9110.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9111.xml"/>
      </references>
      <references>
        <name>Informative References</name>
        <reference anchor="ROBOTSTXT" target="https://www.robotstxt.org/">
          <front>
            <title>The Web Robots Pages (including /robots.txt)</title>
            <author>
              <organization/>
            </author>
            <date>2007</date>
          </front>
        </reference>
        <reference anchor="SITEMAPS" target="https://www.sitemaps.org/index.html">
          <front>
            <title>What are Sitemaps? (Sitemap protocol)</title>
            <author>
              <organization/>
            </author>
            <date>April 2020</date>
          </front>
        </reference>
        <reference anchor="KiB" target="https://simple.wikipedia.org/wiki/Kibibyte">
          <front>
            <title>Kibibyte</title>
            <author>
              <organization/>
            </author>
            <date day="17" month="September" year="2020"/>
          </front>
          <refcontent>Simple English Wikipedia, the free encyclopedia</refcontent>
        </reference>
      </references>
    </references>
  </back>
</rfc>