<?xml version='1.0' encoding='utf-8'?>
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" version="3" ipr="trust200902" docName="draft-ietf-jsonpath-iregexp-08" number="9485" submissionType="IETF" category="std" consensus="true" tocInclude="true" sortRefs="true" symRefs="true" updates="" obsoletes="" xml:lang="en" prepTime="2023-10-13T15:09:04" indexInclude="true" scripts="Common,Latin" tocDepth="3">
  <link href="https://datatracker.ietf.org/doc/draft-ietf-jsonpath-iregexp-08" rel="prev"/>
  <link href="https://dx.doi.org/10.17487/rfc9485" rel="alternate"/>
  <link href="urn:issn:2070-1721" rel="alternate"/>
  <front>
    <title abbrev="I-Regexp">I-Regexp: An Interoperable Regular Expression Format</title>
    <seriesInfo name="RFC" value="9485" stream="IETF"/>
    <author initials="C." surname="Bormann" fullname="Carsten Bormann">
      <organization showOnFrontPage="true">Universität Bremen TZI</organization>
      <address>
        <postal>
          <street>Postfach 330440</street>
          <city>Bremen</city>
          <code>D-28359</code>
          <country>Germany</country>
        </postal>
        <phone>+49-421-218-63921</phone>
        <email>cabo@tzi.org</email>
      </address>
    </author>
    <author initials="T." surname="Bray" fullname="Tim Bray">
      <organization showOnFrontPage="true">Textuality</organization>
      <address>
        <postal>
          <country>Canada</country>
        </postal>
        <email>tbray@textuality.com</email>
      </address>
    </author>
    <date month="10" year="2023"/>
    <area>art</area>
    <workgroup>jsonpath</workgroup>
    <keyword>Regexp</keyword>
    <keyword>Regex</keyword>
    <abstract pn="section-abstract">
      <t indent="0" pn="section-abstract-1">This document specifies I-Regexp, a flavor of regular expression that is
limited in scope with the goal of interoperation across many different
regular expression libraries.</t>
    </abstract>
    <boilerplate>
      <section anchor="status-of-memo" numbered="false" removeInRFC="false" toc="exclude" pn="section-boilerplate.1">
        <name slugifiedName="name-status-of-this-memo">Status of This Memo</name>
        <t indent="0" pn="section-boilerplate.1-1">
            This is an Internet Standards Track document.
        </t>
        <t indent="0" pn="section-boilerplate.1-2">
            This document is a product of the Internet Engineering Task Force
            (IETF).  It represents the consensus of the IETF community.  It has
            received public review and has been approved for publication by
            the Internet Engineering Steering Group (IESG).  Further
            information on Internet Standards is available in Section 2 of 
            RFC 7841.
        </t>
        <t indent="0" pn="section-boilerplate.1-3">
            Information about the current status of this document, any
            errata, and how to provide feedback on it may be obtained at
            <eref target="https://www.rfc-editor.org/info/rfc9485" brackets="none"/>.
        </t>
      </section>
      <section anchor="copyright" numbered="false" removeInRFC="false" toc="exclude" pn="section-boilerplate.2">
        <name slugifiedName="name-copyright-notice">Copyright Notice</name>
        <t indent="0" pn="section-boilerplate.2-1">
            Copyright (c) 2023 IETF Trust and the persons identified as the
            document authors. All rights reserved.
        </t>
        <t indent="0" pn="section-boilerplate.2-2">
            This document is subject to BCP 78 and the IETF Trust's Legal
            Provisions Relating to IETF Documents
            (<eref target="https://trustee.ietf.org/license-info" brackets="none"/>) in effect on the date of
            publication of this document. Please review these documents
            carefully, as they describe your rights and restrictions with
            respect to this document. Code Components extracted from this
            document must include Revised BSD License text as described in
            Section 4.e of the Trust Legal Provisions and are provided without
            warranty as described in the Revised BSD License.
        </t>
      </section>
    </boilerplate>
    <toc>
      <section anchor="toc" numbered="false" removeInRFC="false" toc="exclude" pn="section-toc.1">
        <name slugifiedName="name-table-of-contents">Table of Contents</name>
        <ul bare="true" empty="true" indent="2" spacing="compact" pn="section-toc.1-1">
          <li pn="section-toc.1-1.1">
            <t indent="0" keepWithNext="true" pn="section-toc.1-1.1.1"><xref derivedContent="1" format="counter" sectionFormat="of" target="section-1"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-introduction">Introduction</xref></t>
            <ul bare="true" empty="true" indent="2" spacing="compact" pn="section-toc.1-1.1.2">
              <li pn="section-toc.1-1.1.2.1">
                <t indent="0" keepWithNext="true" pn="section-toc.1-1.1.2.1.1"><xref derivedContent="1.1" format="counter" sectionFormat="of" target="section-1.1"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-terminology">Terminology</xref></t>
              </li>
            </ul>
          </li>
          <li pn="section-toc.1-1.2">
            <t indent="0" keepWithNext="true" pn="section-toc.1-1.2.1"><xref derivedContent="2" format="counter" sectionFormat="of" target="section-2"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-objectives">Objectives</xref></t>
          </li>
          <li pn="section-toc.1-1.3">
            <t indent="0" pn="section-toc.1-1.3.1"><xref derivedContent="3" format="counter" sectionFormat="of" target="section-3"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-i-regexp-syntax">I-Regexp Syntax</xref></t>
            <ul bare="true" empty="true" indent="2" spacing="compact" pn="section-toc.1-1.3.2">
              <li pn="section-toc.1-1.3.2.1">
                <t indent="0" pn="section-toc.1-1.3.2.1.1"><xref derivedContent="3.1" format="counter" sectionFormat="of" target="section-3.1"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-checking-implementations">Checking Implementations</xref></t>
              </li>
            </ul>
          </li>
          <li pn="section-toc.1-1.4">
            <t indent="0" pn="section-toc.1-1.4.1"><xref derivedContent="4" format="counter" sectionFormat="of" target="section-4"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-i-regexp-semantics">I-Regexp Semantics</xref></t>
          </li>
          <li pn="section-toc.1-1.5">
            <t indent="0" pn="section-toc.1-1.5.1"><xref derivedContent="5" format="counter" sectionFormat="of" target="section-5"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-mapping-i-regexp-to-regexp-">Mapping I-Regexp to Regexp Dialects</xref></t>
            <ul bare="true" empty="true" indent="2" spacing="compact" pn="section-toc.1-1.5.2">
              <li pn="section-toc.1-1.5.2.1">
                <t indent="0" pn="section-toc.1-1.5.2.1.1"><xref derivedContent="5.1" format="counter" sectionFormat="of" target="section-5.1"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-multi-character-escapes">Multi-Character Escapes</xref></t>
              </li>
              <li pn="section-toc.1-1.5.2.2">
                <t indent="0" pn="section-toc.1-1.5.2.2.1"><xref derivedContent="5.2" format="counter" sectionFormat="of" target="section-5.2"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-xsd-regexps">XSD Regexps</xref></t>
              </li>
              <li pn="section-toc.1-1.5.2.3">
                <t indent="0" pn="section-toc.1-1.5.2.3.1"><xref derivedContent="5.3" format="counter" sectionFormat="of" target="section-5.3"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-ecmascript-regexps">ECMAScript Regexps</xref></t>
              </li>
              <li pn="section-toc.1-1.5.2.4">
                <t indent="0" pn="section-toc.1-1.5.2.4.1"><xref derivedContent="5.4" format="counter" sectionFormat="of" target="section-5.4"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-pcre-re2-and-ruby-regexps">PCRE, RE2, and Ruby Regexps</xref></t>
              </li>
            </ul>
          </li>
          <li pn="section-toc.1-1.6">
            <t indent="0" pn="section-toc.1-1.6.1"><xref derivedContent="6" format="counter" sectionFormat="of" target="section-6"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-motivation-and-background">Motivation and Background</xref></t>
            <ul bare="true" empty="true" indent="2" spacing="compact" pn="section-toc.1-1.6.2">
              <li pn="section-toc.1-1.6.2.1">
                <t indent="0" pn="section-toc.1-1.6.2.1.1"><xref derivedContent="6.1" format="counter" sectionFormat="of" target="section-6.1"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-implementing-i-regexp">Implementing I-Regexp</xref></t>
              </li>
            </ul>
          </li>
          <li pn="section-toc.1-1.7">
            <t indent="0" pn="section-toc.1-1.7.1"><xref derivedContent="7" format="counter" sectionFormat="of" target="section-7"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-iana-considerations">IANA Considerations</xref></t>
          </li>
          <li pn="section-toc.1-1.8">
            <t indent="0" pn="section-toc.1-1.8.1"><xref derivedContent="8" format="counter" sectionFormat="of" target="section-8"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-security-considerations">Security Considerations</xref></t>
          </li>
          <li pn="section-toc.1-1.9">
            <t indent="0" pn="section-toc.1-1.9.1"><xref derivedContent="9" format="counter" sectionFormat="of" target="section-9"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-references">References</xref></t>
            <ul bare="true" empty="true" indent="2" spacing="compact" pn="section-toc.1-1.9.2">
              <li pn="section-toc.1-1.9.2.1">
                <t indent="0" pn="section-toc.1-1.9.2.1.1"><xref derivedContent="9.1" format="counter" sectionFormat="of" target="section-9.1"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-normative-references">Normative References</xref></t>
              </li>
              <li pn="section-toc.1-1.9.2.2">
                <t indent="0" pn="section-toc.1-1.9.2.2.1"><xref derivedContent="9.2" format="counter" sectionFormat="of" target="section-9.2"/>.  <xref derivedContent="" format="title" sectionFormat="of" target="name-informative-references">Informative References</xref></t>
              </li>
            </ul>
          </li>
          <li pn="section-toc.1-1.10">
            <t indent="0" pn="section-toc.1-1.10.1"><xref derivedContent="" format="none" sectionFormat="of" target="section-appendix.a"/><xref derivedContent="" format="title" sectionFormat="of" target="name-acknowledgements">Acknowledgements</xref></t>
          </li>
          <li pn="section-toc.1-1.11">
            <t indent="0" pn="section-toc.1-1.11.1"><xref derivedContent="" format="none" sectionFormat="of" target="section-appendix.b"/><xref derivedContent="" format="title" sectionFormat="of" target="name-authors-addresses">Authors' Addresses</xref></t>
          </li>
        </ul>
      </section>
    </toc>
  </front>
  <middle>
    <section anchor="intro" numbered="true" removeInRFC="false" toc="include" pn="section-1">
      <name slugifiedName="name-introduction">Introduction</name>
      <t indent="0" pn="section-1-1">This specification describes an interoperable regular expression (abbreviated as "regexp") flavor, I-Regexp.</t>
      <t indent="0" pn="section-1-2">I-Regexp does not provide advanced regular expression features such as capture groups, lookahead, or backreferences.
It supports only a Boolean matching capability, i.e., testing whether a given regular expression matches a given piece of text.</t>
      <t indent="0" pn="section-1-3">I-Regexp supports the entire repertoire of Unicode characters (Unicode
scalar values); both the I-Regexp strings themselves and the strings
they are matched against are sequences of Unicode scalar values (often
represented in UTF-8 encoding form <xref target="STD63" format="default" sectionFormat="of" derivedContent="STD63"/> for interchange).</t>
      <t indent="0" pn="section-1-4">I-Regexp is a subset of XML Schema Definition (XSD) regular expressions <xref target="XSD-2" format="default" sectionFormat="of" derivedContent="XSD-2"/>.</t>
      <t indent="0" pn="section-1-5">This document includes guidance for converting I-Regexps for use with several well-known regular expression dialects.</t>
      <t indent="0" pn="section-1-6">The development of I-Regexp was motivated by the work of the JSONPath Working Group (WG). The WG wanted to include support for the use of regular expressions in JSONPath filters 
in its specification <xref target="I-D.ietf-jsonpath-base" format="default" sectionFormat="of" derivedContent="JSONPATH-BASE"/>, but was unable to find a useful
specification for regular expressions that would be interoperable across the popular libraries.</t>
      <section anchor="terminology" numbered="true" removeInRFC="false" toc="include" pn="section-1.1">
        <name slugifiedName="name-terminology">Terminology</name>
        <t indent="0" pn="section-1.1-1">This document uses the abbreviation "regexp" for what is usually
called a "regular expression" in programming.
The term "I-Regexp" is used as a noun meaning a character string (sequence of
Unicode scalar values) that conforms to the requirements
in this specification; the plural is "I-Regexps".</t>
        <t indent="0" pn="section-1.1-2">This specification uses Unicode terminology; a good entry point is provided by <xref target="UNICODE-GLOSSARY" format="default" sectionFormat="of" derivedContent="UNICODE-GLOSSARY"/>.</t>
        <t indent="0" pn="section-1.1-3">
    The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>",
    "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to be interpreted as
    described in BCP 14 <xref target="RFC2119" format="default" sectionFormat="of" derivedContent="RFC2119"/> <xref target="RFC8174" format="default" sectionFormat="of" derivedContent="RFC8174"/> 
    when, and only when, they appear in all capitals, as shown here.
        </t>
        <t indent="0" pn="section-1.1-4">The grammatical rules in this document are to be interpreted as ABNF,
as described in <xref target="RFC5234" format="default" sectionFormat="of" derivedContent="RFC5234"/> and <xref target="RFC7405" format="default" sectionFormat="of" derivedContent="RFC7405"/>, where the "characters" of
<xref section="2.3" sectionFormat="of" target="RFC5234" format="default" derivedLink="https://rfc-editor.org/rfc/rfc5234#section-2.3" derivedContent="RFC5234"/> are Unicode scalar values.</t>
      </section>
    </section>
    <section anchor="objectives" numbered="true" removeInRFC="false" toc="include" pn="section-2">
      <name slugifiedName="name-objectives">Objectives</name>
      <t indent="0" pn="section-2-1">I-Regexps should handle the vast majority of practical cases where a
matching regexp is needed in a data-model specification or a query-language expression.</t>
      <t indent="0" pn="section-2-2">At the time of writing, an editor of this document conducted a survey of the regexp syntax
used in recently published RFCs. All examples found there should be covered by I-Regexps,
both syntactically and with their intended semantics.
The exception is the use of multi-character escapes, for which
workaround guidance is provided in <xref target="mapping" format="default" sectionFormat="of" derivedContent="Section 5"/>.</t>
    </section>
    <section anchor="defn" numbered="true" removeInRFC="false" toc="include" pn="section-3">
      <name slugifiedName="name-i-regexp-syntax">I-Regexp Syntax</name>
      <t indent="0" pn="section-3-1">An I-Regexp <bcp14>MUST</bcp14> conform to the ABNF specification in
<xref target="iregexp-abnf" format="default" sectionFormat="of" derivedContent="Figure 1"/>.</t>
      <figure anchor="iregexp-abnf" align="left" suppress-title="false" pn="figure-1">
        <name slugifiedName="name-i-regexp-syntax-in-abnf">I-Regexp Syntax in ABNF</name>
        <sourcecode type="abnf" markers="false" pn="section-3-2.1">
i-regexp = branch *( "|" branch )
branch = *piece
piece = atom [ quantifier ]
quantifier = ( "*" / "+" / "?" ) / range-quantifier
range-quantifier = "{" QuantExact [ "," [ QuantExact ] ] "}"
QuantExact = 1*%x30-39 ; '0'-'9'

atom = NormalChar / charClass / ( "(" i-regexp ")" )
NormalChar = ( %x00-27 / "," / "-" / %x2F-3E ; '/'-'&gt;'
 / %x40-5A ; '@'-'Z'
 / %x5E-7A ; '^'-'z'
 / %x7E-D7FF ; skip surrogate code points
 / %xE000-10FFFF )
charClass = "." / SingleCharEsc / charClassEsc / charClassExpr
SingleCharEsc = "\" ( %x28-2B ; '('-'+'
 / "-" / "." / "?" / %x5B-5E ; '['-'^'
 / %s"n" / %s"r" / %s"t" / %x7B-7D ; '{'-'}'
 )
charClassEsc = catEsc / complEsc
charClassExpr = "[" [ "^" ] ( "-" / CCE1 ) *CCE1 [ "-" ] "]"
CCE1 = ( CCchar [ "-" CCchar ] ) / charClassEsc
CCchar = ( %x00-2C / %x2E-5A ; '.'-'Z'
 / %x5E-D7FF ; skip surrogate code points
 / %xE000-10FFFF ) / SingleCharEsc
catEsc = %s"\p{" charProp "}"
complEsc = %s"\P{" charProp "}"
charProp = IsCategory
IsCategory = Letters / Marks / Numbers / Punctuation / Separators /
    Symbols / Others
Letters = %s"L" [ ( %s"l" / %s"m" / %s"o" / %s"t" / %s"u" ) ]
Marks = %s"M" [ ( %s"c" / %s"e" / %s"n" ) ]
Numbers = %s"N" [ ( %s"d" / %s"l" / %s"o" ) ]
Punctuation = %s"P" [ ( %x63-66 ; 'c'-'f'
 / %s"i" / %s"o" / %s"s" ) ]
Separators = %s"Z" [ ( %s"l" / %s"p" / %s"s" ) ]
Symbols = %s"S" [ ( %s"c" / %s"k" / %s"m" / %s"o" ) ]
Others = %s"C" [ ( %s"c" / %s"f" / %s"n" / %s"o" ) ]
</sourcecode>
      </figure>
      <t indent="0" pn="section-3-3">As an additional restriction, <tt>charClassExpr</tt> is not allowed to
match <tt>[^]</tt>, which, according to this grammar, would parse as a
positive character class containing the single character <tt>^</tt>.</t>
      <t indent="0" pn="section-3-4">This is essentially an XSD regexp without:</t>
      <ul bare="false" empty="false" indent="3" spacing="normal" pn="section-3-5">
        <li pn="section-3-5.1">character class
subtraction,</li>
        <li pn="section-3-5.2">multi-character escapes such as <tt>\s</tt>,
<tt>\S</tt>, and <tt>\w</tt>, and </li>
        <li pn="section-3-5.3">Unicode blocks.</li>
      </ul>
      <t indent="0" pn="section-3-6">An I-Regexp implementation <bcp14>MUST</bcp14> be a complete implementation of this
limited subset.
In particular, full support for the Unicode functionality defined in
      this specification is <bcp14>REQUIRED</bcp14>. The implementation:</t>
      <ul bare="false" empty="false" indent="3" spacing="normal" pn="section-3-7">
        <li pn="section-3-7.1">
          <bcp14>MUST NOT</bcp14> limit itself to 7- or 8-bit character sets such as ASCII, and</li>
        <li pn="section-3-7.2">
          <bcp14>MUST</bcp14> support the Unicode character property set in character classes.</li>
      </ul>
      <section anchor="checking" numbered="true" removeInRFC="false" toc="include" pn="section-3.1">
        <name slugifiedName="name-checking-implementations">Checking Implementations</name>
        <t indent="0" pn="section-3.1-1">A <em>checking</em> I-Regexp implementation is one that checks a supplied
regexp for compliance with this specification and reports any problems.
Checking implementations give their users confidence that they didn't
accidentally insert syntax that is not interoperable, so checking is <bcp14>RECOMMENDED</bcp14>.
Exceptions to this rule may be made for low-effort implementations
that map I-Regexp to another regexp library by simple steps such as
performing the mapping operations discussed in <xref target="mapping" format="default" sectionFormat="of" derivedContent="Section 5"/>.  Here, the
effort needed to do full checking might dwarf the rest of the
implementation effort.
Implementations <bcp14>SHOULD</bcp14> document whether or not they are checking.</t>
        <t indent="0" pn="section-3.1-2">Specifications that employ I-Regexp may want to define in which
cases their implementations can work with a non-checking I-Regexp
implementation and when full checking is needed, possibly in the
process of defining their own implementation classes.</t>
      </section>
    </section>
    <section anchor="i-regexp-semantics" numbered="true" removeInRFC="false" toc="include" pn="section-4">
      <name slugifiedName="name-i-regexp-semantics">I-Regexp Semantics</name>
      <t indent="0" pn="section-4-1">This syntax is a subset of that of <xref target="XSD-2" format="default" sectionFormat="of" derivedContent="XSD-2"/>.
Implementations that interpret I-Regexps <bcp14>MUST</bcp14>
yield Boolean results as specified in <xref target="XSD-2" format="default" sectionFormat="of" derivedContent="XSD-2"/>.
(See also <xref target="xsd-regexps" format="default" sectionFormat="of" derivedContent="Section 5.2"/>.)</t>
    </section>
    <section anchor="mapping" numbered="true" removeInRFC="false" toc="include" pn="section-5">
      <name slugifiedName="name-mapping-i-regexp-to-regexp-">Mapping I-Regexp to Regexp Dialects</name>
      <t indent="0" pn="section-5-1">The material in this section is not normative; it is provided as guidance
to developers who want to use I-Regexps in the context of other
regular expression dialects.</t>
      <section anchor="multi-character-escapes" numbered="true" removeInRFC="false" toc="include" pn="section-5.1">
        <name slugifiedName="name-multi-character-escapes">Multi-Character Escapes</name>
        <t indent="0" pn="section-5.1-1">I-Regexp does not support common multi-character escapes (MCEs) or character classes built around them.  These can usually
be replaced as shown by the examples in <xref target="tbl-sub" format="default" sectionFormat="of" derivedContent="Table 1"/>.</t>
        <table anchor="tbl-sub" align="center" pn="table-1">
          <name slugifiedName="name-example-substitutes-for-mul">Example Substitutes for Multi-Character Escapes</name>
          <thead>
            <tr>
              <th align="left" colspan="1" rowspan="1">MCE/class:</th>
              <th align="left" colspan="1" rowspan="1">Replace with:</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td align="left" colspan="1" rowspan="1">
                <tt>\S</tt></td>
              <td align="left" colspan="1" rowspan="1">
                <tt>[^ \t\n\r]</tt></td>
            </tr>
            <tr>
              <td align="left" colspan="1" rowspan="1">
                <tt>[\S ]</tt></td>
              <td align="left" colspan="1" rowspan="1">
                <tt>[^\t\n\r]</tt></td>
            </tr>
            <tr>
              <td align="left" colspan="1" rowspan="1">
                <tt>\d</tt></td>
              <td align="left" colspan="1" rowspan="1">
                <tt>[0-9]</tt></td>
            </tr>
          </tbody>
        </table>
        <t indent="0" pn="section-5.1-3">Note that the semantics of <tt>\d</tt> in XSD regular expressions
        is that of <tt>\p{Nd}</tt>; however, this would include all Unicode
        characters that are digits in various writing systems, which is almost
        certainly not what is required in IETF publications.</t>
        <t indent="0" pn="section-5.1-4">The construct <tt>\p{IsBasicLatin}</tt> is essentially a reference to legacy
ASCII; it can be replaced by the character class <tt>[\u0000-\u007f]</tt>.</t>
      </section>
      <section anchor="xsd-regexps" numbered="true" removeInRFC="false" toc="include" pn="section-5.2">
        <name slugifiedName="name-xsd-regexps">XSD Regexps</name>
        <t indent="0" pn="section-5.2-1">Any I-Regexp is also an XSD regexp <xref target="XSD-2" format="default" sectionFormat="of" derivedContent="XSD-2"/>, so the mapping is an identity
function.</t>
        <t indent="0" pn="section-5.2-2">Note that a few errata for <xref target="XSD-2" format="default" sectionFormat="of" derivedContent="XSD-2"/> have been fixed in <xref target="XSD-1.1-2" format="default" sectionFormat="of" derivedContent="XSD-1.1-2"/>; therefore, the latter
is also included in the <xref target="norm" format="default" sectionFormat="of" derivedContent="Section 9.1">Normative References</xref>.
XSD 1.1 is less widely implemented than XSD 1.0, and implementations
of XSD 1.0 are likely to include these bugfixes; for the intents
and purposes of this specification, an implementation of XSD 1.0
regexps is equivalent to an implementation of XSD 1.1 regexps.</t>
      </section>
      <section anchor="toESreg" numbered="true" removeInRFC="false" toc="include" pn="section-5.3">
        <name slugifiedName="name-ecmascript-regexps">ECMAScript Regexps</name>
        <t indent="0" pn="section-5.3-1">Perform the following steps on an I-Regexp to obtain an ECMAScript
regexp <xref target="ECMA-262" format="default" sectionFormat="of" derivedContent="ECMA-262"/>:</t>
        <ul spacing="normal" bare="false" empty="false" indent="3" pn="section-5.3-2">
          <li pn="section-5.3-2.1">For each unescaped dot (<tt>.</tt>) outside character classes
          (first alternative of the <tt>charClass</tt> production), replace the dot with
          <tt>[^\n\r]</tt>.</li>
          <li pn="section-5.3-2.2">Enclose the result in <tt>^(?:</tt> and <tt>)$</tt>.</li>
        </ul>
        <t indent="0" pn="section-5.3-3">The ECMAScript regexp is to be interpreted as a Unicode pattern ("u"
flag; see Section 21.2.2 "Pattern Semantics" of <xref target="ECMA-262" format="default" sectionFormat="of" derivedContent="ECMA-262"/>).</t>
        <t indent="0" pn="section-5.3-4">Note that where a regexp literal is required,
the actual regexp needs to be enclosed in <tt>/</tt>.</t>
      </section>
      <section anchor="pcre-re2-ruby-regexps" numbered="true" removeInRFC="false" toc="include" pn="section-5.4">
        <name slugifiedName="name-pcre-re2-and-ruby-regexps">PCRE, RE2, and Ruby Regexps</name>
        <t indent="0" pn="section-5.4-1">To obtain a valid regexp in Perl Compatible Regular Expressions
   (PCRE) <xref target="PCRE2" format="default" sectionFormat="of" derivedContent="PCRE2"/>, the Go programming language's RE2 regexp library <xref target="RE2" format="default" sectionFormat="of" derivedContent="RE2"/>, and the Ruby
programming language, perform the same steps as in <xref target="toESreg" format="default" sectionFormat="of" derivedContent="Section 5.3"/>, except that the last step is:</t>
        <ul spacing="normal" bare="false" empty="false" indent="3" pn="section-5.4-2">
          <li pn="section-5.4-2.1">Enclose the regexp in <tt>\A(?:</tt> and <tt>)\z</tt>.</li>
        </ul>
      </section>
    </section>
    <section anchor="background" numbered="true" removeInRFC="false" toc="include" pn="section-6">
      <name slugifiedName="name-motivation-and-background">Motivation and Background</name>
      <t indent="0" pn="section-6-1">While regular expressions originally were intended to describe a
formal language to support a Boolean matching function, they
have been enhanced with parsing functions that support the extraction
and replacement of arbitrary portions of the matched text. With this
accretion of features, parsing-regexp libraries have become
more susceptible to bugs and surprising performance degradations that
can be exploited in denial-of-service attacks by
an attacker who controls the regexp submitted for
processing. I-Regexp is designed to offer interoperability and to be
less vulnerable to such attacks, with the trade-off that its only
function is to offer a Boolean response as to whether a character
sequence is matched by a regexp.</t>
      <section anchor="subsetting" numbered="true" removeInRFC="false" toc="include" pn="section-6.1">
        <name slugifiedName="name-implementing-i-regexp">Implementing I-Regexp</name>
        <t indent="0" pn="section-6.1-1">XSD regexps are relatively easy to implement or map to widely
implemented parsing-regexp dialects, with these notable
exceptions:</t>
        <ul spacing="normal" bare="false" empty="false" indent="3" pn="section-6.1-2">
          <li pn="section-6.1-2.1">Character class subtraction.  This is a very useful feature in
          many specifications, but it is unfortunately mostly absent from
          parsing-regexp dialects. Thus, it is omitted from I-Regexp.</li>
          <li pn="section-6.1-2.2">Multi-character escapes.  <tt>\d</tt>, <tt>\w</tt>, <tt>\s</tt>
          and their uppercase complement classes exhibit a large amount of
          variation between regexp flavors.  Thus, they are omitted from
          I-Regexp.</li>
          <li pn="section-6.1-2.3">Not all regexp implementations support access to Unicode
          tables that enable executing constructs such as <tt>\p{Nd}</tt>,
          although the <tt>\p</tt>/<tt>\P</tt> feature in general is now quite
          widely available. While, in principle, it is possible to translate
          these into character-class matches, this also requires access to
          those tables. Thus, regexp libraries in severely constrained
          environments may not be able to support I-Regexp conformance.</li>
        </ul>
      </section>
    </section>
    <section anchor="iana-considerations" numbered="true" removeInRFC="false" toc="include" pn="section-7">
      <name slugifiedName="name-iana-considerations">IANA Considerations</name>
      <t indent="0" pn="section-7-1">This document has no IANA actions.</t>
    </section>
    <section anchor="security-considerations" numbered="true" removeInRFC="false" toc="include" pn="section-8">
      <name slugifiedName="name-security-considerations">Security Considerations</name>
      <t indent="0" pn="section-8-1">While technically out of the scope of this specification, Section <xref target="RFC3629" section="10" sectionFormat="bare" format="default" derivedLink="https://rfc-editor.org/rfc/rfc3629#section-10" derivedContent="RFC3629"> "Security Considerations"</xref> of RFC 3629 <xref target="STD63" format="default" sectionFormat="of" derivedContent="STD63"/> applies to implementations.
Particular note needs to be taken of the last paragraph of Section <xref target="RFC3629" section="3" sectionFormat="bare" format="default" derivedLink="https://rfc-editor.org/rfc/rfc3629#section-3" derivedContent="RFC3629"> "UTF-8 definition" </xref> of RFC 3629 <xref target="STD63" format="default" sectionFormat="of" derivedContent="STD63"/>; an I-Regexp implementation may need to
mitigate limitations of the platform implementation in this regard.</t>
      <t indent="0" pn="section-8-2">As discussed in <xref target="background" format="default" sectionFormat="of" derivedContent="Section 6"/>, more complex regexp libraries may
contain exploitable bugs, which can lead to crashes and remote code
execution.  There is also the problem that such libraries often have
performance characteristics that are hard to predict, leading to attacks
that overload an implementation by matching against an expensive
attacker-controlled regexp.</t>
      <t indent="0" pn="section-8-3">I-Regexps have been designed to allow implementation in a way that is
resilient to both threats; this objective needs to be addressed
throughout the implementation effort. Non-checking implementations (see <xref target="checking" format="default" sectionFormat="of" derivedContent="Section 3.1"/>) are likely to expose
security limitations of any regexp engine they use, which may be less
problematic if that engine has been built with security considerations
in mind (e.g., <xref target="RE2" format="default" sectionFormat="of" derivedContent="RE2"/>).  In any case, a checking implementation is still <bcp14>RECOMMENDED</bcp14>.</t>
      <t indent="0" pn="section-8-4">Implementations that specifically implement the I-Regexp subset can,
with care, be designed to generally run in linear time and space in
the input and to detect when that would not be the case (see below).</t>
      <t indent="0" pn="section-8-5">Existing regexp engines should be able to easily handle most I-Regexps
(after the adjustments discussed in <xref target="mapping" format="default" sectionFormat="of" derivedContent="Section 5"/>), but may consume
excessive resources for some types of I-Regexps or outright reject
them because they cannot guarantee efficient execution.
(Note that different versions of the same regexp library may be more or
      less vulnerable to excessive resource consumption for these cases.)</t>
      <t indent="0" pn="section-8-6">Specifically, range quantifiers (as in <tt>a{2,4}</tt>) provide particular
challenges for both existing and I-Regexp-focused implementations.
Implementations may therefore limit range quantifiers in composability
(disallowing nested range quantifiers such as <tt>(a{2,4}){2,4}</tt>) or
range (disallowing very large ranges such as <tt>a{20,200000}</tt>), or detect
and reject any excessive resource consumption caused by range quantifiers.</t>
      <t indent="0" pn="section-8-7">I-Regexp implementations that are used to evaluate regexps from
untrusted sources need to be robust in these cases.
Implementers using existing regexp libraries are encouraged:</t>
      <ul bare="false" empty="false" indent="3" spacing="normal" pn="section-8-8">
        <li pn="section-8-8.1">to check
their documentation to see if mitigations are configurable, such as
  limits on resource consumption, and</li>
        <li pn="section-8-8.2">to document their own degree of
robustness resulting from employing such mitigations.</li>
      </ul>
    </section>
  </middle>
  <back>
    <displayreference target="I-D.ietf-jsonpath-base" to="JSONPATH-BASE"/>
    <references pn="section-9">
      <name slugifiedName="name-references">References</name>
      <references anchor="norm" pn="section-9.1">
        <name slugifiedName="name-normative-references">Normative References</name>
        <reference anchor="RFC2119" target="https://www.rfc-editor.org/info/rfc2119" quoteTitle="true" derivedAnchor="RFC2119">
          <front>
            <title>Key words for use in RFCs to Indicate Requirement Levels</title>
            <author fullname="S. Bradner" initials="S." surname="Bradner"/>
            <date month="March" year="1997"/>
            <abstract>
              <t indent="0">In many standards track documents several words are used to signify the requirements in the specification. These words are often capitalized. This document defines these words as they should be interpreted in IETF documents. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements.</t>
            </abstract>
          </front>
          <seriesInfo name="BCP" value="14"/>
          <seriesInfo name="RFC" value="2119"/>
          <seriesInfo name="DOI" value="10.17487/RFC2119"/>
        </reference>
        <reference anchor="RFC5234" target="https://www.rfc-editor.org/info/rfc5234" quoteTitle="true" derivedAnchor="RFC5234">
          <front>
            <title>Augmented BNF for Syntax Specifications: ABNF</title>
            <author fullname="D. Crocker" initials="D." role="editor" surname="Crocker"/>
            <author fullname="P. Overell" initials="P." surname="Overell"/>
            <date month="January" year="2008"/>
            <abstract>
              <t indent="0">Internet technical specifications often need to define a formal syntax. Over the years, a modified version of Backus-Naur Form (BNF), called Augmented BNF (ABNF), has been popular among many Internet specifications. The current specification documents ABNF. It balances compactness and simplicity with reasonable representational power. The differences between standard BNF and ABNF involve naming rules, repetition, alternatives, order-independence, and value ranges. This specification also supplies additional rule definitions and encoding for a core lexical analyzer of the type common to several Internet specifications. [STANDARDS-TRACK]</t>
            </abstract>
          </front>
          <seriesInfo name="STD" value="68"/>
          <seriesInfo name="RFC" value="5234"/>
          <seriesInfo name="DOI" value="10.17487/RFC5234"/>
        </reference>
        <reference anchor="RFC7405" target="https://www.rfc-editor.org/info/rfc7405" quoteTitle="true" derivedAnchor="RFC7405">
          <front>
            <title>Case-Sensitive String Support in ABNF</title>
            <author fullname="P. Kyzivat" initials="P." surname="Kyzivat"/>
            <date month="December" year="2014"/>
            <abstract>
              <t indent="0">This document extends the base definition of ABNF (Augmented Backus-Naur Form) to include a way to specify US-ASCII string literals that are matched in a case-sensitive manner.</t>
            </abstract>
          </front>
          <seriesInfo name="RFC" value="7405"/>
          <seriesInfo name="DOI" value="10.17487/RFC7405"/>
        </reference>
        <reference anchor="RFC8174" target="https://www.rfc-editor.org/info/rfc8174" quoteTitle="true" derivedAnchor="RFC8174">
          <front>
            <title>Ambiguity of Uppercase vs Lowercase in RFC 2119 Key Words</title>
            <author fullname="B. Leiba" initials="B." surname="Leiba"/>
            <date month="May" year="2017"/>
            <abstract>
              <t indent="0">RFC 2119 specifies common key words that may be used in protocol specifications. This document aims to reduce the ambiguity by clarifying that only UPPERCASE usage of the key words have the defined special meanings.</t>
            </abstract>
          </front>
          <seriesInfo name="BCP" value="14"/>
          <seriesInfo name="RFC" value="8174"/>
          <seriesInfo name="DOI" value="10.17487/RFC8174"/>
        </reference>
        <reference anchor="XSD-1.1-2" target="https://www.w3.org/TR/2012/REC-xmlschema11-2-20120405/" quoteTitle="true" derivedAnchor="XSD-1.1-2">
          <front>
            <title>W3C XML Schema Definition Language (XSD) 1.1 Part 2: Datatypes</title>
            <author fullname="David Peterson" role="editor"/>
            <author fullname="Shudi Gao" role="editor"/>
            <author fullname="Ashok Malhotra" role="editor"/>
            <author fullname="C. M. Sperberg-McQueen" role="editor"/>
            <author fullname="Henry Thompson" role="editor"/>
            <author fullname="Paul Biron" role="editor"/>
            <date day="5" month="April" year="2012"/>
          </front>
          <seriesInfo name="W3C REC" value="REC-xmlschema11-2-20120405"/>
          <seriesInfo name="W3C" value="REC-xmlschema11-2-20120405"/>
        </reference>
        <reference anchor="XSD-2" target="https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/" quoteTitle="true" derivedAnchor="XSD-2">
          <front>
            <title>XML Schema Part 2: Datatypes Second Edition</title>
            <author fullname="Paul Biron" role="editor"/>
            <author fullname="Ashok Malhotra" role="editor"/>
            <date day="28" month="October" year="2004"/>
          </front>
          <seriesInfo name="W3C REC" value="REC-xmlschema-2-20041028"/>
          <seriesInfo name="W3C" value="REC-xmlschema-2-20041028"/>
        </reference>
      </references>
      <references pn="section-9.2">
        <name slugifiedName="name-informative-references">Informative References</name>
        <reference anchor="ECMA-262" target="https://www.ecma-international.org/wp-content/uploads/ECMA-262.pdf" quoteTitle="true" derivedAnchor="ECMA-262">
          <front>
            <title>ECMAScript 2020 Language Specification</title>
            <author>
              <organization showOnFrontPage="true">Ecma International</organization>
            </author>
            <date year="2020" month="June"/>
          </front>
          <refcontent>Standard ECMA-262, 11th Edition</refcontent>
        </reference>
        <reference anchor="I-D.ietf-jsonpath-base" target="https://datatracker.ietf.org/doc/html/draft-ietf-jsonpath-base-20" quoteTitle="true" derivedAnchor="JSONPATH-BASE">
          <front>
            <title>JSONPath: Query expressions for JSON</title>
            <author initials="S." surname="Gössner" fullname="Stefan Gössner" role="editor">
              <organization showOnFrontPage="true">Fachhochschule Dortmund</organization>
            </author>
            <author initials="G." surname="Normington" fullname="Glyn Normington" role="editor"/>
            <author initials="C." surname="Bormann" fullname="Carsten Bormann" role="editor">
              <organization showOnFrontPage="true">Universität Bremen TZI</organization>
            </author>
            <date month="August" day="25" year="2023"/>
          </front>
          <seriesInfo name="Internet-Draft" value="draft-ietf-jsonpath-base-20"/>
          <refcontent>Work in Progress</refcontent>
        </reference>
        <reference anchor="PCRE2" target="http://pcre.org/current/doc/html/" quoteTitle="true" derivedAnchor="PCRE2">
          <front>
            <title>Perl-compatible Regular Expressions (revised API: PCRE2)</title>
            <author>
              <organization showOnFrontPage="true"/>
            </author>
          </front>
        </reference>
        <reference anchor="RE2" target="https://github.com/google/re2" quoteTitle="true" derivedAnchor="RE2">
          <front>
            <title>RE2 is a fast, safe, thread-friendly alternative to backtracking regular expression engines like those used in PCRE, Perl, and Python. It is a C++ library.</title>
            <author>
              <organization showOnFrontPage="true"/>
            </author>
          </front>
          <refcontent>commit 73031bb</refcontent>
        </reference>
        <reference anchor="RFC7493" target="https://www.rfc-editor.org/info/rfc7493" quoteTitle="true" derivedAnchor="RFC7493">
          <front>
            <title>The I-JSON Message Format</title>
            <author fullname="T. Bray" initials="T." role="editor" surname="Bray"/>
            <date month="March" year="2015"/>
            <abstract>
              <t indent="0">I-JSON (short for "Internet JSON") is a restricted profile of JSON designed to maximize interoperability and increase confidence that software can process it successfully with predictable results.</t>
            </abstract>
          </front>
          <seriesInfo name="RFC" value="7493"/>
          <seriesInfo name="DOI" value="10.17487/RFC7493"/>
        </reference>
        <referencegroup anchor="STD63" target="https://www.rfc-editor.org/info/std63" derivedAnchor="STD63">
          <reference anchor="RFC3629" target="https://www.rfc-editor.org/info/rfc3629" quoteTitle="true">
            <front>
              <title>UTF-8, a transformation format of ISO 10646</title>
              <author fullname="F. Yergeau" initials="F." surname="Yergeau"/>
              <date month="November" year="2003"/>
              <abstract>
                <t indent="0">ISO/IEC 10646-1 defines a large character set called the Universal Character Set (UCS) which encompasses most of the world's writing systems. The originally proposed encodings of the UCS, however, were not compatible with many current applications and protocols, and this has led to the development of UTF-8, the object of this memo. UTF-8 has the characteristic of preserving the full US-ASCII range, providing compatibility with file systems, parsers and other software that rely on US-ASCII values but are transparent to other values. This memo obsoletes and replaces RFC 2279.</t>
              </abstract>
            </front>
            <seriesInfo name="STD" value="63"/>
            <seriesInfo name="RFC" value="3629"/>
            <seriesInfo name="DOI" value="10.17487/RFC3629"/>
          </reference>
        </referencegroup>
        <reference anchor="UNICODE-GLOSSARY" target="https://unicode.org/glossary/" quoteTitle="true" derivedAnchor="UNICODE-GLOSSARY">
          <front>
            <title>Glossary of Unicode Terms</title>
            <author>
              <organization showOnFrontPage="true">Unicode, Inc.</organization>
            </author>
            <date/>
          </front>
        </reference>
      </references>
    </references>
    <section numbered="false" anchor="acknowledgements" removeInRFC="false" toc="include" pn="section-appendix.a">
      <name slugifiedName="name-acknowledgements">Acknowledgements</name>
      <t indent="0" pn="section-appendix.a-1">Discussion in the IETF
      JSONPATH WG about whether to include a regexp mechanism in the
      JSONPath query expression specification and previous
      discussions about the YANG <tt>pattern</tt> and Concise Data
   Definition Language (CDDL) <tt>.regexp</tt>
      features motivated this specification.</t>
      <t indent="0" pn="section-appendix.a-2">The basic approach for this specification was inspired by "<xref target="RFC7493" format="title" sectionFormat="of" derivedContent="The I-JSON Message Format"/>" <xref target="RFC7493" format="default" sectionFormat="of" derivedContent="RFC7493"/>.</t>
    </section>
    <section anchor="authors-addresses" numbered="false" removeInRFC="false" toc="include" pn="section-appendix.b">
      <name slugifiedName="name-authors-addresses">Authors' Addresses</name>
      <author initials="C." surname="Bormann" fullname="Carsten Bormann">
        <organization showOnFrontPage="true">Universität Bremen TZI</organization>
        <address>
          <postal>
            <street>Postfach 330440</street>
            <city>Bremen</city>
            <code>D-28359</code>
            <country>Germany</country>
          </postal>
          <phone>+49-421-218-63921</phone>
          <email>cabo@tzi.org</email>
        </address>
      </author>
      <author initials="T." surname="Bray" fullname="Tim Bray">
        <organization showOnFrontPage="true">Textuality</organization>
        <address>
          <postal>
            <country>Canada</country>
          </postal>
          <email>tbray@textuality.com</email>
        </address>
      </author>
    </section>
  </back>
</rfc>
