% \iffalse meta-comment
%
%% File: l3str-convert.dtx
%
% Copyright (C) 2013-2025 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
% Driver: typesets this file with the l3doc class by inputting the
% .dtx file itself through \DocInput.
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
%
% \title{^^A
%   The \pkg{l3str-convert} module\\ String encoding conversions^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2025-01-18}
%
% \maketitle
%
% \begin{documentation}
%
% \section{Encoding and escaping schemes}
%
% Traditionally, string encodings only specify how strings of characters
% should be stored as bytes. However, the resulting lists of bytes are
% often to be used in contexts where only a restricted subset of bytes
% are permitted (\emph{e.g.}, \textsc{pdf} string objects,
% \textsc{url}s).  Hence, storing a string of characters is done in two
% steps.
% \begin{itemize}
%   \item The code points (\enquote{character codes}) are expressed as
%     bytes following a given \enquote{encoding}. This can be
%     \textsc{utf-16}, \textsc{iso 8859-1}, \emph{etc.}  See
%     Table~\ref{tab:encodings} for a list of supported
%     encodings.\footnote{Encodings and escapings will be added as they
%       are requested.}
%   \item Bytes are translated to \TeX{} tokens through a given
%     \enquote{escaping}. Those are defined for the most part by the
%     \texttt{pdf} file format.  See Table~\ref{tab:escapings} for a
%     list of escaping methods supported.\footnotemark[\csname c@footnote\endcsname]
% \end{itemize}
%
% \begin{table}\centering
%   \caption{\label{tab:encodings}Supported encodings.
%     Non-alphanumeric characters are ignored,
%     and capital letters are lower-cased
%     before searching for the encoding in this list.}
%   \begin{tabular}{cc}
%     \toprule
%     \meta{Encoding}   & description \\
%     \midrule
%     \texttt{utf8}     & \textsc{utf-8} \\
%     \texttt{utf16}    & \textsc{utf-16}, with byte-order mark \\
%     \texttt{utf16be}  & \textsc{utf-16}, big-endian \\
%     \texttt{utf16le}  & \textsc{utf-16}, little-endian \\
%     \texttt{utf32}    & \textsc{utf-32}, with byte-order mark \\
%     \texttt{utf32be}  & \textsc{utf-32}, big-endian \\
%     \texttt{utf32le}  & \textsc{utf-32}, little-endian \\
%     \midrule
%     \texttt{iso88591},  \texttt{latin1}  & \textsc{iso 8859-1} \\
%     \texttt{iso88592},  \texttt{latin2}  & \textsc{iso 8859-2} \\
%     \texttt{iso88593},  \texttt{latin3}  & \textsc{iso 8859-3} \\
%     \texttt{iso88594},  \texttt{latin4}  & \textsc{iso 8859-4} \\
%     \texttt{iso88595}                    & \textsc{iso 8859-5} \\
%     \texttt{iso88596}                    & \textsc{iso 8859-6} \\
%     \texttt{iso88597}                    & \textsc{iso 8859-7} \\
%     \texttt{iso88598}                    & \textsc{iso 8859-8} \\
%     \texttt{iso88599},  \texttt{latin5}  & \textsc{iso 8859-9} \\
%     \texttt{iso885910}, \texttt{latin6}  & \textsc{iso 8859-10} \\
%     \texttt{iso885911}                   & \textsc{iso 8859-11} \\
%     \texttt{iso885913}, \texttt{latin7}  & \textsc{iso 8859-13} \\
%     \texttt{iso885914}, \texttt{latin8}  & \textsc{iso 8859-14} \\
%     \texttt{iso885915}, \texttt{latin9}  & \textsc{iso 8859-15} \\
%     \texttt{iso885916}, \texttt{latin10} & \textsc{iso 8859-16} \\
%     \midrule
%     \texttt{clist}                       & comma-list of integers \\
%     \meta{empty}                         & native (Unicode) string \\
%     \texttt{default}                     & like \texttt{utf8} with 8-bit engines,
%                                            and like native with Unicode engines \\
%     \bottomrule
%   \end{tabular}
% \end{table}
%
% \begin{table}\centering
%   \caption{\label{tab:escapings}Supported escapings.
%     Non-alphanumeric characters are ignored,
%     and capital letters are lower-cased
%     before searching for the escaping in this list.}
%   \begin{tabular}{cc}
%     \toprule
%     \meta{Escaping} & description \\
%     \midrule
%     \texttt{bytes}, or empty
%       & arbitrary bytes \\
%     \texttt{hex}, \texttt{hexadecimal}
%       & byte $=$ two hexadecimal digits \\
%     \texttt{name}
%       & see \tn{pdfescapename} \\
%     \texttt{string}
%       & see \tn{pdfescapestring} \\
%     \texttt{url}
%       & encoding used in \textsc{url}s \\
%     \bottomrule
%   \end{tabular}
% \end{table}
%
% \section{Conversion functions}
%
% \begin{function}{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
%   \begin{syntax}
%     \cs{str_set_convert:Nnnn} \meta{str~var} \Arg{string} \Arg{name_1} \Arg{name_2}
%   \end{syntax}
%   This function converts the \meta{string} from the encoding given by
%   \meta{name_1} to the encoding given by \meta{name_2}, and stores the
%   result in the \meta{str~var}.  Each \meta{name} can have the form
%   \meta{encoding} or \meta{encoding}\texttt{/}\meta{escaping}, where
%   the possible values of \meta{encoding} and \meta{escaping} are given
%   in Tables~\ref{tab:encodings} and~\ref{tab:escapings}, respectively.
%   The default escaping is to input and output bytes directly.  The
%   special case of an empty \meta{name} indicates the use of
%   \enquote{native} strings, 8-bit for \pdfTeX{}, and Unicode strings
%   for \XeTeX{} and \LuaTeX{}.
%
%   For example,
%   \begin{verbatim}
%     \str_set_convert:Nnnn \l_foo_str { Hello! } { } { utf16/hex }
%   \end{verbatim}
%   results in the variable \cs[no-index]{l_foo_str} holding the string
%   \texttt{FEFF00480065006C006C006F0021}. This is obtained by
%   converting each character in the (native) string \texttt{Hello!}  to
%   the \textsc{utf-16} encoding, and expressing each byte as a pair of
%   hexadecimal digits. Note the presence of a (big-endian) byte order
%   mark \hexnum{FEFF}, which can be avoided by specifying the encoding
%   \texttt{utf16be/hex}.
%
%   An error is raised if the \meta{string} is not valid according to
%   the escaping and encoding given by \meta{name_1}, or if it cannot be
%   reencoded in the encoding and escaping given by \meta{name_2} (for
%   instance, if a character does not exist in the target encoding).
%   Erroneous input is replaced by the Unicode replacement character
%   \hexnum{FFFD}, and characters which cannot be reencoded are replaced
%   by either the replacement character \hexnum{FFFD} if it exists in
%   the target encoding, or an encoding-specific replacement
%   character, or the question mark character.
% \end{function}
%
% \begin{function}[TF]{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
%   \begin{syntax}
%     \cs{str_set_convert:NnnnTF} \meta{str~var} \Arg{string} \Arg{name_1} \Arg{name_2} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   As \cs{str_set_convert:Nnnn}, converts the \meta{string} from the
%   encoding given by \meta{name_1} to the encoding given by
%   \meta{name_2}, and assigns the result to \meta{str~var}. Unlike
%   \cs{str_set_convert:Nnnn}, the conditional variant does not raise
%   errors in case the \meta{string} is not valid according to the
%   \meta{name_1} encoding, or cannot be expressed in the \meta{name_2}
%   encoding. Instead, the \meta{false code} is performed.
% \end{function}
%
% \section{Conversion by expansion (for PDF contexts)}
%
% A small number of expandable functions are provided for use in PDF string/name
% contexts. These \emph{assume UTF-8} and \emph{no escaping} in the input.
%
% \begin{function}[EXP]{\str_convert_pdfname:n}
%   \begin{syntax}
%     \cs{str_convert_pdfname:n} \Arg{string}
%   \end{syntax}
%   Like \cs{str_set_convert:Nnnn}, converts the \meta{string} on a
%   byte-by-byte basis, with non-\textsc{ascii} code points escaped using
%   hashes.
% \end{function}
%
% \section{Possibilities, and things to do}
%
% Encoding/escaping-related tasks.
% \begin{itemize}
%   \item In \XeTeX{}/\LuaTeX{}, would it be better to use the
%     |^^^^....| approach to build a string from a given list of
%     character codes?  Namely, within a group, assign |0-9a-f| and all
%     characters we want to category ``other'', then assign~|^| the
%     category superscript, and use \tn{scantokens}.
%   \item Change \cs{str_set_convert:Nnnn} to expand its last two
%     arguments.
%   \item Describe the internal format in the code comments. Refuse code
%     points in $[\hexnum{D800}, \hexnum{DFFF}]$ in the internal
%     representation?
%   \item Add documentation about each encoding and escaping method, and
%     add examples.
%   \item The \texttt{hex} unescaping should raise an error for
%     strings with an odd number of tokens.
%   \item Decide what bytes should be escaped in the \texttt{url}
%     escaping. Perhaps the characters |!'()*-./0123456789_| are safe,
%     and all other characters should be escaped?
%   \item Automate generation of 8-bit mapping files.
%   \item Change the framework for 8-bit encodings: for decoding from
%     8-bit to Unicode, use $256$ integer registers; for encoding, use a
%     tree-box.
%   \item More encodings (see Heiko's \pkg{stringenc}). CESU?
%   \item More escapings: \textsc{ascii85}, shell escapes, lua escapes,
%     \emph{etc.}?
% \end{itemize}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3str-convert} implementation}
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
%    \begin{macrocode}
%<@@=str>
%    \end{macrocode}
%
% \subsection{Helpers}
%
% \subsubsection{Variables and constants}
%
% \begin{macro}{\@@_tmp:w}
% \begin{variable}{\l_@@_internal_tl}
%   Internal scratch space for some functions: a protected scratch
%   function, initially defined with an empty replacement text, and a
%   scratch token list.  In this file the token list holds, for
%   instance, the hexadecimal digits while the byte constants are built,
%   and the resolved alias name in \cs{@@_convert:nnnn}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tmp:w { }
\tl_new:N \l_@@_internal_tl
%    \end{macrocode}
% \end{variable}
% \end{macro}
%
% \begin{variable}{\g_@@_result_tl}
%   The \cs{g_@@_result_tl} variable is used to hold the result of
%   various internal string operations (mostly conversions) which are
%   typically performed in a group. The variable is global so that it
%   remains defined outside the group, to be assigned to a user-provided
%   variable.  For conversions, it first receives the input string, and
%   each conversion step then replaces its contents by the converted
%   form.
%    \begin{macrocode}
\tl_new:N \g_@@_result_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_replacement_char_int}
%   When converting, invalid bytes are replaced by the Unicode
%   replacement character, whose code point \hexnum{FFFD} is stored in
%   this integer constant.
%    \begin{macrocode}
\int_const:Nn \c_@@_replacement_char_int { "FFFD }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_max_byte_int}
%   The largest value a byte can take, namely $255$.
%    \begin{macrocode}
\int_const:Nn \c_@@_max_byte_int { 255 }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\s_@@}
%   Internal scan mark.  It delimits the two fields of each item in the
%   internal representation of strings used during conversions, and the
%   end of the list of items when mapping over that representation.
%    \begin{macrocode}
\scan_new:N \s_@@
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\q_@@_nil}
%   Internal quark.
%    \begin{macrocode}
\quark_new:N \q_@@_nil
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_alias_prop}
%   Rather than shipping one definition file per encoding or escaping
%   name, alternative names are recorded in a property list which maps
%   each alias to the name under which the definitions are found.  The
%   name \texttt{default} is an alias for the native (empty) encoding in
%   \LuaTeX{} and \XeTeX{}, and for \texttt{utf8} with other engines.
%    \begin{macrocode}
\prop_new:N \g_@@_alias_prop
\prop_gset_from_keyval:Nn \g_@@_alias_prop
  {
    latin1      = iso88591  ,
    latin2      = iso88592  ,
    latin3      = iso88593  ,
    latin4      = iso88594  ,
    latin5      = iso88599  ,
    latin6      = iso885910 ,
    latin7      = iso885913 ,
    latin8      = iso885914 ,
    latin9      = iso885915 ,
    latin10     = iso885916 ,
    utf16le     = utf16     ,
    utf16be     = utf16     ,
    utf32le     = utf32     ,
    utf32be     = utf32     ,
    hexadecimal = hex
  }
\bool_lazy_or:nnTF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  { \prop_gput:Nnn \g_@@_alias_prop { default } { } }
  { \prop_gput:Nnn \g_@@_alias_prop { default } { utf8 } }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_error_bool}
%   In conversion functions with a built-in conditional, errors are not
%   reported directly to the user, but the information is collected in
%   this boolean, used at the end to decide on which branch of the
%   conditional to take.  The conditional functions set it to
%   \texttt{false} before converting, and
%   \cs{@@_if_flag_no_error:Nne} sets it to \texttt{true} when the
%   flag it tests has been raised.
%    \begin{macrocode}
\bool_new:N \g_@@_error_bool
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_byte_flag, \l_@@_error_flag}
%   Conversions from one \meta{encoding}/\meta{escaping} pair to another
%   are done within \texttt{e}-expanding assignments. Errors are
%   signalled by raising the relevant flag, which is tested after the
%   conversion step.  The flag \cs{l_@@_byte_flag} is raised by
%   \cs{@@_filter_bytes:n} when it meets a character which is not a
%   byte.
%    \begin{macrocode}
\flag_new:N \l_@@_byte_flag
\flag_new:N \l_@@_error_flag
%    \end{macrocode}
% \end{variable}
%
% \subsection{String conditionals}
%
% \begin{macro}[EXP]{\@@_if_contains_char:NnT, \@@_if_contains_char:NnTF}
% \begin{macro}[EXP]{\@@_if_contains_char:nnTF}
% \begin{macro}[EXP]{\@@_if_contains_char_aux:nn,\@@_if_contains_char_auxi:nN}
% \begin{macro}[EXP]{\@@_if_contains_char_true:}
%   \begin{syntax}
%     \cs{@@_if_contains_char:nnTF} \Arg{token list} \meta{char}
%   \end{syntax}
%   Tests whether the \meta{char} appears in the \meta{token list}.
%   Expects the \meta{token list} to be an \meta{other string}: the
%   caller is responsible for ensuring that no (too-)special catcodes
%   remain.  The \texttt{N}-type variant takes a token list variable,
%   which is expanded once to get at its contents.
%
%   Loop over the characters of the string, comparing character codes
%   with \cs{if_charcode:w}.  The loop is broken if character codes
%   match: \cs{@@_if_contains_char_true:} discards everything up to
%   \cs{prg_break_point:}, returns \enquote{true}, and removes the
%   trailing \cs{prg_return_false:}.  Otherwise the loop reaches the
%   end marker |{ \prg_break:n { ? \fi: } }|, whose expansion within the
%   character test discards the rest of the loop and closes the
%   conditional, and we return \enquote{false}.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_if_contains_char:Nn #1#2 { T , TF }
  {
    \exp_after:wN \@@_if_contains_char_aux:nn \exp_after:wN {#1} {#2}
      { \prg_break:n { ? \fi: } }
    \prg_break_point:
    \prg_return_false:
  }
\cs_new:Npn \@@_if_contains_char_aux:nn #1#2
  { \@@_if_contains_char_auxi:nN {#2} #1 }
\prg_new_conditional:Npnn \@@_if_contains_char:nn #1#2 { TF }
  {
    \@@_if_contains_char_auxi:nN {#2} #1 { \prg_break:n { ? \fi: } }
    \prg_break_point:
    \prg_return_false:
  }
\cs_new:Npn \@@_if_contains_char_auxi:nN #1#2
  {
    \if_charcode:w #1 #2
      \exp_after:wN \@@_if_contains_char_true:
    \fi:
    \@@_if_contains_char_auxi:nN {#1}
  }
\cs_new:Npn \@@_if_contains_char_true:
  { \prg_break:n { \prg_return_true: \use_none:n } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_octal_use:NTF}
%   \begin{syntax}
%     \cs{@@_octal_use:NTF} \meta{token} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   If the \meta{token} is an octal digit, it is left in the input
%   stream, \emph{followed} by the \meta{true code}. Otherwise, the
%   \meta{false code} is left in the input stream.
%   \begin{texnote}
%     This function will fail if the escape character is an octal
%     digit. We are thus careful to set the escape character to a known
%     value before using it.
%   \end{texnote}
%   \TeX{} dutifully detects octal digits for us: if |#1| is an octal
%   digit, then the right-hand side of the comparison is |'1#1|, greater
%   than $1$. Otherwise, the right-hand side stops as |'1|, and the
%   conditional takes the \texttt{false} branch.  The \meta{token} is
%   passed through \cs{token_to_str:N} before being compared, hence the
%   restriction on the escape character stated above.  In the
%   \texttt{true} branch, |#1| is placed before \cs{prg_return_true:},
%   which is how the digit ends up in front of the \meta{true code}.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_octal_use:N #1 { TF }
  {
    \if_int_compare:w 1 < '1 \token_to_str:N #1 \exp_stop_f:
      #1 \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_hexadecimal_use:NTF}
%   \TeX{} detects uppercase hexadecimal digits for us (see
%   \cs{@@_octal_use:NTF}), but not the lowercase letters, which we
%   need to detect and replace by their uppercase counterpart.  When the
%   first test fails, the difference between the character code of |#1|
%   and that of |a| selects one of the letters |A| to |F| (differences
%   $0$ to $5$), which is left in the input stream before the
%   \texttt{true} branch is taken.  For any other difference, we return
%   \enquote{false} and \cs{use_none:n} removes the
%   \cs{prg_return_true:} which follows the inner conditional.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_hexadecimal_use:N #1 { TF }
  {
    \if_int_compare:w 1 < "1 \token_to_str:N #1 \exp_stop_f:
      #1 \prg_return_true:
    \else:
      \if_case:w \int_eval:n { \exp_after:wN ` \token_to_str:N #1 - `a }
           A
      \or: B
      \or: C
      \or: D
      \or: E
      \or: F
      \else:
        \prg_return_false:
        \exp_after:wN \use_none:n
      \fi:
      \prg_return_true:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Conversions}
%
% \subsubsection{Producing one byte or character}
%
% \begin{variable}{\c_@@_byte_0_tl, \c_@@_byte_1_tl, \c_@@_byte_255_tl}
% \begin{variable}{\c_@@_byte_-1_tl}
%   For each integer $N$ in the range $[0,255]$, we create a constant
%   token list which holds three character tokens with category code
%   other: the character with character code $N$, followed by the
%   representation of $N$ as two hexadecimal digits.  The value $-1$ is
%   given a default token list which ensures that later functions give
%   an empty result for the input $-1$.
%
%   The constants are built by two nested mappings over the sixteen
%   hexadecimal digits, stored as a string in \cs{l_@@_internal_tl}: the
%   outer mapping provides the first digit |#1| and the inner one the
%   second digit |##1|.  The token list for $-1$ consists of an empty
%   brace group followed by \cs{use_none:n} and another empty group, so
%   that picking its first item or dropping its first item both lead to
%   nothing.
%    \begin{macrocode}
\group_begin:
  \__kernel_tl_set:Nx \l_@@_internal_tl { \tl_to_str:n { 0123456789ABCDEF } }
  \tl_map_inline:Nn \l_@@_internal_tl
    {
      \tl_map_inline:Nn \l_@@_internal_tl
        {
          \tl_const:ce { c_@@_byte_ \int_eval:n {"#1##1} _tl }
            { \char_generate:nn { "#1##1 } { 12 } #1 ##1 }
        }
    }
\group_end:
\tl_const:cn { c_@@_byte_-1_tl } { { } \use_none:n { } }
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{macro}[EXP]{\@@_output_byte:n}
% \begin{macro}[EXP]{\@@_output_byte:w}
% \begin{macro}[EXP]{\@@_output_hexadecimal:n}
% \begin{macro}[EXP]{\@@_output_end:}
%   Those functions must be used carefully: feeding them a value outside
%   the range $[-1,255]$ will attempt to use the undefined token list
%   variable \cs{c_@@_byte_\meta{number}_tl}. Assuming that the
%   argument is in the right range, we expand the corresponding token
%   list, and pick either the byte (first token) or the hexadecimal
%   representations (second and third tokens). The value $-1$ produces
%   an empty result in both cases.
%
%   The \texttt{w}-type function \cs{@@_output_byte:w} expects an
%   integer expression terminated by \cs{@@_output_end:}: the latter
%   provides the \cs{scan_stop:} which ends the expression, and the
%   tokens which complete the name of the token list variable.
%    \begin{macrocode}
\cs_new:Npn \@@_output_byte:n #1
  { \@@_output_byte:w #1 \@@_output_end: }
\cs_new:Npn \@@_output_byte:w
  {
    \exp_after:wN \exp_after:wN
    \exp_after:wN \use_i:nnn
    \cs:w c_@@_byte_ \int_eval:w
  }
\cs_new:Npn \@@_output_hexadecimal:n #1
  {
    \exp_after:wN \exp_after:wN
    \exp_after:wN \use_none:n
    \cs:w c_@@_byte_ \int_eval:n {#1} _tl \cs_end:
  }
\cs_new:Npn \@@_output_end:
  { \scan_stop: _tl \cs_end: }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_output_byte_pair_be:n}
% \begin{macro}[rEXP]{\@@_output_byte_pair_le:n}
% \begin{macro}[rEXP]{\@@_output_byte_pair:nnN}
%   Convert a number in the range $[0,65535]$ to a pair of bytes, either
%   big-endian or little-endian.  The high byte, namely the number
%   divided by $256$ and truncated, is computed once by
%   \texttt{f}-expansion and passed to the auxiliary as its first
%   argument; the low byte is the number minus $256$ times the high
%   byte.  The last argument of the auxiliary, \cs{use:nn} or
%   \cs{use_ii_i:nn}, decides in which order the two bytes are output.
%    \begin{macrocode}
\cs_new:Npn \@@_output_byte_pair_be:n #1
  {
    \exp_args:Nf \@@_output_byte_pair:nnN
      { \int_div_truncate:nn { #1 } { "100 } } {#1} \use:nn
  }
\cs_new:Npn \@@_output_byte_pair_le:n #1
  {
    \exp_args:Nf \@@_output_byte_pair:nnN
      { \int_div_truncate:nn { #1 } { "100 } } {#1} \use_ii_i:nn
  }
\cs_new:Npn \@@_output_byte_pair:nnN #1#2#3
  {
    #3
      { \@@_output_byte:n { #1 } }
      { \@@_output_byte:n { #2 - #1 * "100 } }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{Mapping functions for conversions}
%
% \begin{macro}{\@@_convert_gmap:N}
% \begin{macro}[rEXP]{\@@_convert_gmap_loop:NN}
%   This maps the function |#1| over all characters in
%   \cs{g_@@_result_tl}, which should be a byte string in most cases,
%   sometimes a native string.  The result of the mapping is stored back
%   into \cs{g_@@_result_tl}.  The loop ends when it reaches the marker
%   |{ ? \prg_break: }|: for an ordinary character, \cs{use_none:n}
%   simply removes one copy of the character, while for the marker it
%   removes |?| and thus exposes \cs{prg_break:}, which discards
%   everything up to \cs{prg_break_point:}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_gmap:N #1
  {
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_convert_gmap_loop:NN
        \exp_after:wN #1
          \g_@@_result_tl { ? \prg_break: }
        \prg_break_point:
      }
  }
\cs_new:Npn \@@_convert_gmap_loop:NN #1#2
  {
    \use_none:n #2
    #1#2
    \@@_convert_gmap_loop:NN #1
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_gmap_internal:N}
% \begin{macro}[rEXP]{\@@_convert_gmap_internal_loop:Nww}
%   This maps the function |#1| over all character codes in
%   \cs{g_@@_result_tl}, which must be in the internal representation.
%   The result of the mapping is stored back into \cs{g_@@_result_tl}.
%   Each step of the loop grabs one item of the form \meta{bytes}
%   \cs{s_@@} \meta{code point} \cs{s_@@}, drops the \meta{bytes}, and
%   applies |#1| to the braced \meta{code point}.  The end marker
%   appended after the token list makes the loop grab
%   |\s_@@_stop \prg_break:| in place of a code point: the auxiliary
%   \cs{@@_use_none_delimit_by_s_stop:w} (defined elsewhere) then stops
%   at that first \cs{s_@@_stop} and \cs{prg_break:} ends the loop.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_gmap_internal:N #1
  {
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_convert_gmap_internal_loop:Nww
        \exp_after:wN #1
          \g_@@_result_tl \s_@@ \s_@@_stop \prg_break: \s_@@
        \prg_break_point:
      }
  }
\cs_new:Npn \@@_convert_gmap_internal_loop:Nww #1 #2 \s_@@ #3 \s_@@
  {
    \@@_use_none_delimit_by_s_stop:w #3 \s_@@_stop
    #1 {#3}
    \@@_convert_gmap_internal_loop:Nww #1
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Error-reporting during conversion}
%
% \begin{macro}{\@@_if_flag_error:Nne}
% \begin{macro}{\@@_if_flag_no_error:Nne}
%   After each step of a conversion started by
%   \cs{str_set_convert:Nnnn}, the flag |#1| (typically
%   \texttt{@@_error}) tells us whether something went wrong during
%   that step.  If the flag is raised, \cs{@@_if_flag_error:Nne} issues
%   the error message named |#2| of the \texttt{str} module, with |#3|
%   as its (expanded) argument; otherwise both arguments are dropped.
%   The conditional functions \cs{str_set_convert:NnnnTF} must not
%   produce error messages: they locally replace
%   \cs{@@_if_flag_error:Nne} by \cs{@@_if_flag_no_error:Nne}, which
%   ignores |#2| and |#3| and merely records a raised flag in
%   \cs{g_@@_error_bool}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_if_flag_error:Nne #1#2#3
  { \flag_if_raised:NT #1 { \msg_error:nne { str } {#2} {#3} } }
\cs_new_protected:Npn \@@_if_flag_no_error:Nne #1#2#3
  {
    \flag_if_raised:NTF #1
      { \bool_gset_true:N \g_@@_error_bool }
      { }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_if_flag_times:NT}
%   The error message issued at the end of a conversion step is built
%   on the fly from all the errors met during that step.  The height of
%   a flag counts how many times the corresponding error occurred.  If
%   the flag |#1| is raised, this function leaves |#2| followed by the
%   number of occurrences in parentheses; if not, it leaves nothing.
%    \begin{macrocode}
\cs_new:Npn \@@_if_flag_times:NT #1#2
  {
    \flag_if_raised:NTF #1
      { #2 ~ ( x \flag_height:N #1 ) }
      { }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Framework for conversions}
%
% Most functions in this module expect to be working with
% \enquote{native} strings. Strings can also be stored as bytes, in one
% of many encodings, for instance \textsc{utf-8}.  The bytes themselves
% can be expressed in various ways in terms of \TeX{} tokens, for
% instance as pairs of hexadecimal digits. The questions of going from
% arbitrary Unicode code points to bytes, and from bytes to tokens are
% mostly independent.
%
% Conversions are done in four steps:
% \begin{itemize}
%   \item \enquote{unescape} produces a string of bytes;
%   \item \enquote{decode} takes in a string of bytes, and converts it
%     to a list of Unicode characters in an internal representation,
%     with items of the form
%     \begin{quote}
%       \meta{bytes} \cs{s_@@} \meta{Unicode code point} \cs{s_@@}
%     \end{quote}
%     where we have collected the \meta{bytes} which combined to form
%     this particular Unicode character, and the \meta{Unicode code
%       point} is in the range $[0,\hexnum{10FFFF}]$.
%   \item \enquote{encode} encodes the internal list of code points as a
%     byte string in the new encoding;
%   \item \enquote{escape} escapes bytes as requested.
% \end{itemize}
% The process is modified in case one of the encodings is empty (or the
% conversion function has been set equal to the empty encoding because
% it was not found): then the unescape or escape step is ignored, and
% the decode or encode steps work on tokens instead of bytes. Otherwise,
% each step must ensure that it passes a correct byte string or internal
% string to the next step.
%
% \begin{macro}{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
% \begin{macro}[TF]{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
% \begin{macro}{\@@_convert:nNNnnn}
%   All the public functions are thin wrappers around
%   \cs{@@_convert:nNNnnn}, whose arguments are: some set-up code run
%   inside the group, the assignment function (\cs{tl_set_eq:NN} or
%   \cs{tl_gset_eq:NN}), the user's variable, the input string, and the
%   two \meta{encoding}/\meta{escaping} names.  Within a group, the
%   input string is stored in \cs{g_@@_result_tl}, then we unescape and
%   decode, then encode and escape, each step acting on
%   \cs{g_@@_result_tl}; finally we leave the group and copy the result
%   to the user's variable.  The conditional functions reset
%   \cs{g_@@_error_bool}, silence errors by locally making
%   \cs{@@_if_flag_error:Nne} equal to \cs{@@_if_flag_no_error:Nne}, and
%   branch on the boolean once the conversion is done.
%    \begin{macrocode}
\cs_new_protected:Npn \str_set_convert:Nnnn #1#2#3#4
  { \@@_convert:nNNnnn { } \tl_set_eq:NN #1 {#2} {#3} {#4} }
\cs_new_protected:Npn \str_gset_convert:Nnnn #1#2#3#4
  { \@@_convert:nNNnnn { } \tl_gset_eq:NN #1 {#2} {#3} {#4} }
\prg_new_protected_conditional:Npnn
    \str_set_convert:Nnnn #1#2#3#4 { T , F , TF }
  {
    \bool_gset_false:N \g_@@_error_bool
    \@@_convert:nNNnnn
      { \cs_set_eq:NN \@@_if_flag_error:Nne \@@_if_flag_no_error:Nne }
      \tl_set_eq:NN #1 {#2} {#3} {#4}
    \bool_if:NTF \g_@@_error_bool
      { \prg_return_false: }
      { \prg_return_true: }
  }
\prg_new_protected_conditional:Npnn
    \str_gset_convert:Nnnn #1#2#3#4 { T , F , TF }
  {
    \bool_gset_false:N \g_@@_error_bool
    \@@_convert:nNNnnn
      { \cs_set_eq:NN \@@_if_flag_error:Nne \@@_if_flag_no_error:Nne }
      \tl_gset_eq:NN #1 {#2} {#3} {#4}
    \bool_if:NTF \g_@@_error_bool
      { \prg_return_false: }
      { \prg_return_true: }
  }
\cs_new_protected:Npn \@@_convert:nNNnnn #1#2#3#4#5#6
  {
    \group_begin:
      #1
      \__kernel_tl_gset:Nx \g_@@_result_tl
        { \__kernel_str_to_other_fast:n {#4} }
      \exp_after:wN \@@_convert:wwwnn
        \tl_to_str:n {#5} /// \s_@@_stop
        { decode } { unescape }
        \prg_do_nothing:
        \@@_convert_decode_:
      \exp_after:wN \@@_convert:wwwnn
        \tl_to_str:n {#6} /// \s_@@_stop
        { encode } { escape }
        \use_ii_i:nn
        \@@_convert_encode_:
      \__kernel_tl_gset:Nx \g_@@_result_tl
        { \tl_to_str:V \g_@@_result_tl }
    \group_end:
    #2 #3 \g_@@_result_tl
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert:wwwnn}
% \begin{macro}{\@@_convert:NNnNN}
%   The task of \cs{@@_convert:wwwnn} is to split
%   \meta{encoding}/\meta{escaping} pairs into their components, |#1|
%   and |#2|.  The caller appends |///| and \cs{s_@@_stop} to the name,
%   so that the delimited arguments can be found whether or not the name
%   contains a slash: without a slash, the escaping |#2| is empty.  The
%   arguments |#4| and |#5| are the types of conversion
%   (\texttt{decode} and \texttt{unescape}, or \texttt{encode} and
%   \texttt{escape}).  Calls to \cs{@@_convert:nnn} ensure that the
%   corresponding conversion functions are defined. The auxiliary
%   \cs{@@_convert:NNnNN} does the main work; its last two arguments
%   are the two tokens which follow the arguments of
%   \cs{@@_convert:wwwnn} in the input stream.  For this auxiliary,
%   \begin{itemize}
%     \item |#1| is the encoding conversion function;
%     \item |#2| is the escaping function;
%     \item |#3| is the escaping name for use in an error message;
%     \item |#4| is \cs{prg_do_nothing:} for unescaping/decoding, and
%       \cs{use_ii_i:nn} for encoding/escaping;
%     \item |#5| is the default encoding function (either
%       \enquote{decode} or \enquote{encode}), for which there should be
%       no escaping.
%   \end{itemize}
%   Let us ignore the native encoding for a second. In the
%   unescaping/decoding phase, we want to do |#2#1| in this order, and
%   in the encoding/escaping phase, the order should be reversed:
%   |#4#2#1| does exactly that. If one of the encodings is the default
%   (native), then the escaping should be ignored, with an error if any
%   was given, and only the encoding, |#1|, should be performed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert:wwwnn
    #1 / #2 // #3 \s_@@_stop #4#5
  {
    \@@_convert:nnn {enc} {#4} {#1}
    \@@_convert:nnn {esc} {#5} {#2}
    \exp_args:Ncc \@@_convert:NNnNN
      { @@_convert_#4_#1: } { @@_convert_#5_#2: } {#2}
  }
\cs_new_protected:Npn \@@_convert:NNnNN #1#2#3#4#5
  {
    \if_meaning:w #1 #5
      \tl_if_empty:nF {#3}
        { \msg_error:nne { str } { native-escaping } {#3} }
      #1
    \else:
      #4 #2 #1
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert:nnn}
% \begin{macro}{\@@_convert:nnnn}
%   The arguments of \cs{@@_convert:nnn} are: \texttt{enc} or
%   \texttt{esc}, used to build filenames, the type of the conversion
%   (unescape, decode, encode, escape), and the encoding or escaping
%   name. If the function is already defined, no need to do anything.
%   Otherwise, filter out all non-alphanumerics in the name, and
%   lowercase it. Feed that, and the same three arguments, to
%   \cs{@@_convert:nnnn}.  Thus, for \cs{@@_convert:nnnn}, |#1| is the
%   filtered name, |#2| is \texttt{enc} or \texttt{esc}, |#3| is the
%   type of the conversion, and |#4| is the name as originally given.
%   The task is then to make sure that the
%   conversion function |#3_#1| corresponding to the type |#3| and
%   filtered name |#1| is defined, then set our initial conversion
%   function |#3_#4| equal to that.
%
%   How do we get the |#3_#1| conversion to be defined if it isn't?
%   Two main cases.
%
%   First, if |#1| is a key in \cs{g_@@_alias_prop}, then the value
%   \cs{l_@@_internal_tl} tells us what file to load. Loading is
%   skipped if the file was already read, \emph{i.e.}, if the conversion
%   command based on \cs{l_@@_internal_tl} already exists.  Otherwise,
%   try to load the file; if that fails, there is an error, use the
%   default empty name instead.  The file is named
%   \texttt{l3str-}|#2|\texttt{-}\meta{name}\texttt{.def} and is read
%   within a group, with the category codes of \cs{c_code_cctab}.
%
%   Second, |#1| may be absent from the property list. The
%   \cs{cs_if_exist:cF} test is automatically false, and we search for a
%   file defining the encoding or escaping |#1| (this should allow
%   third-party \texttt{.def} files). If the file is not found, there is
%   an error, use the default empty name instead.
%
%   In all cases, the conversion based on \cs{l_@@_internal_tl} is
%   defined, so we can set the |#3_#1| function equal to that. In some
%   cases (\emph{e.g.}, \texttt{utf16be}), the |#3_#1| function is
%   actually defined within the file we just loaded, and it is different
%   from the \cs{l_@@_internal_tl}-based function: we mustn't clobber
%   that different definition.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert:nnn #1#2#3
  {
    \cs_if_exist:cF { @@_convert_#2_#3: }
      {
        \exp_args:Ne \@@_convert:nnnn
          { \@@_convert_lowercase_alphanum:n {#3} }
          {#1} {#2} {#3}
      }
  }
\cs_new_protected:Npn \@@_convert:nnnn #1#2#3#4
  {
    \cs_if_exist:cF { @@_convert_#3_#1: }
      {
        \prop_get:NnNF \g_@@_alias_prop {#1} \l_@@_internal_tl
          { \tl_set:Nn \l_@@_internal_tl {#1} }
        \cs_if_exist:cF { @@_convert_#3_ \l_@@_internal_tl : }
          {
            \file_if_exist:nTF { l3str-#2- \l_@@_internal_tl .def }
              {
                \group_begin:
                  \cctab_select:N \c_code_cctab
                  \file_input:n { l3str-#2- \l_@@_internal_tl .def }
                \group_end:
              }
              {
                \tl_clear:N \l_@@_internal_tl
                \msg_error:nnee { str } { unknown-#2 } {#4} {#1}
              }
          }
        \cs_if_exist:cF { @@_convert_#3_#1: }
          {
            \cs_gset_eq:cc { @@_convert_#3_#1: }
              { @@_convert_#3_ \l_@@_internal_tl : }
          }
      }
    \cs_gset_eq:cc { @@_convert_#3_#4: } { @@_convert_#3_#1: }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_convert_lowercase_alphanum:n}
% \begin{macro}[rEXP]{\@@_convert_lowercase_alphanum_loop:N}
%   This function keeps only letters and digits, with upper case letters
%   converted to lower case.  The argument is first turned into a
%   string, then the loop examines one character at a time.  Characters
%   whose code is above that of |Z| are kept only if they lie in the
%   range from |a| to |z|.  Characters whose code is below that of |A|
%   are kept only if they are digits, which is detected by the
%   comparison |1 < 1#1|.  The remaining characters, from |A| to |Z|,
%   are shifted to lower case using \cs{@@_output_byte:n}.  The loop
%   ends when it reaches the marker |{ ? \prg_break: }|, from which
%   \cs{use_none:n} removes the |?|, exposing \cs{prg_break:}.
%    \begin{macrocode}
\cs_new:Npn \@@_convert_lowercase_alphanum:n #1
  {
    \exp_after:wN \@@_convert_lowercase_alphanum_loop:N
      \tl_to_str:n {#1} { ? \prg_break: }
    \prg_break_point:
  }
\cs_new:Npn \@@_convert_lowercase_alphanum_loop:N #1
  {
    \use_none:n #1
    \if_int_compare:w `#1 > `Z \exp_stop_f:
      \if_int_compare:w `#1 > `z \exp_stop_f: \else:
        \if_int_compare:w `#1 < `a \exp_stop_f: \else:
          #1
        \fi:
      \fi:
    \else:
      \if_int_compare:w `#1 < `A \exp_stop_f:
        \if_int_compare:w 1 < 1#1 \exp_stop_f:
          #1
        \fi:
      \else:
        \@@_output_byte:n { `#1 + `a - `A }
      \fi:
    \fi:
    \@@_convert_lowercase_alphanum_loop:N
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Byte unescape and escape}
%
% Strings of bytes may need to be stored in auxiliary files in safe
% \enquote{escaping} formats. Each such escaping is only loaded as
% needed. By default, on input any non-byte is filtered out, while the
% output simply consists of letting bytes through.
%
% \begin{macro}[rEXP]{\@@_filter_bytes:n}
% \begin{macro}[rEXP]{\@@_filter_bytes_aux:N}
%   In the case of 8-bit engines, every character is a byte.  For
%   Unicode-aware engines, test the character code; non-bytes cause us
%   to raise the flag \cs{l_@@_byte_flag}.  Spaces have already been given
%   the correct category code when this function is called.
%    \begin{macrocode}
% On LuaTeX and XeTeX, walk through the argument one token at a time,
% keep characters whose code fits in a byte and raise \l_@@_byte_flag
% for the others.  On other engines the argument is passed through
% unchanged.
\bool_lazy_or:nnTF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  {
    \cs_new:Npn \@@_filter_bytes:n #1
      {
        % The braced group is the end marker recognised by the loop.
        \@@_filter_bytes_aux:N #1 { ? \prg_break: }
        \prg_break_point:
      }
    \cs_new:Npn \@@_filter_bytes_aux:N #1
      {
        % Ends the loop when #1 is the end marker, no-op otherwise.
        \use_none:n #1
        \if_int_compare:w `#1 > 255 \exp_stop_f:
          \flag_raise:N \l_@@_byte_flag
        \else:
          #1
        \fi:
        \@@_filter_bytes_aux:N
      }
  }
  { \cs_new_eq:NN \@@_filter_bytes:n \use:n }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_unescape_:}
% \begin{macro}{\@@_convert_unescape_bytes:}
%   The simplest unescaping method removes non-bytes from
%   \cs{g_@@_result_tl}.
%    \begin{macrocode}
% Default unescaping.  On LuaTeX and XeTeX the result is passed through
% \@@_filter_bytes:n and an error is issued if the byte flag was
% raised; on other engines there is nothing to do.
\bool_lazy_or:nnTF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  {
    \cs_new_protected:Npn \@@_convert_unescape_:
      {
        \flag_clear:N \l_@@_byte_flag
        \__kernel_tl_gset:Nx \g_@@_result_tl
          {
            % Expand the variable once so that the filter receives
            % its contents as a braced argument.
            \exp_after:wN \@@_filter_bytes:n
              \exp_after:wN { \g_@@_result_tl }
          }
        \@@_if_flag_error:Nne \l_@@_byte_flag { non-byte } { bytes }
      }
  }
  { \cs_new_protected:Npn \@@_convert_unescape_: { } }
% The escaping named bytes is the same as the one with empty name.
\cs_new_eq:NN \@@_convert_unescape_bytes: \@@_convert_unescape_:
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_:}
% \begin{macro}{\@@_convert_escape_bytes:}
%   The simplest form of escape leaves the bytes from the previous step
%   of the conversion unchanged.
%    \begin{macrocode}
% Default escaping: a protected function with empty replacement text,
% shared by the escaping with empty name and the one named bytes.
\cs_new_protected:Nn \@@_convert_escape_: { }
\cs_new_eq:NN \@@_convert_escape_bytes: \@@_convert_escape_:
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Native strings}
%
% \begin{macro}{\@@_convert_decode_:}
% \begin{macro}[rEXP]{\@@_decode_native_char:N}
%   Convert each character to its character code, one at a time.
%    \begin{macrocode}
% Decoding of native strings: map the per-character function over the
% result.
\cs_new_protected:Nn \@@_convert_decode_:
  { \@@_convert_gmap:N \@@_decode_native_char:N }
% Each character is output followed by \s_@@, its character code as
% digits, and a second \s_@@.
\cs_new:Nn \@@_decode_native_char:N
  { #1 \s_@@ \int_value:w `#1 \s_@@ }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_encode_:}
% \begin{macro}[EXP]{\@@_encode_native_char:n}
%   The conversion from an internal string to native character tokens
%   basically maps \cs{char_generate:nn} through the code-points, but in
%   non-Unicode-aware engines we use a fall-back character |?| rather
%   than nothing when given a character code outside $[0,255]$.  We
%   detect the presence of bad characters using a flag and only produce
%   a single error after the \texttt{e}-expanding assignment.
%    \begin{macrocode}
% Encoding to native strings.  #1 of \@@_encode_native_char:n is a
% character code taken from the internal representation.
\bool_lazy_any:nTF
  {
    \sys_if_engine_luatex_p:
    \sys_if_engine_xetex_p:
  }
  {
    % LuaTeX and XeTeX: every code point is turned into a character of
    % category code 12; no error can arise here.
    \cs_new_protected:Npn \@@_convert_encode_:
      { \@@_convert_gmap_internal:N \@@_encode_native_char:n }
    \cs_new:Npn \@@_encode_native_char:n #1
      { \char_generate:nn {#1} {12} }
  }
  {
    % Other engines: code points above \c_@@_max_byte_int are replaced
    % by ? and raise the error flag, so that a single error is issued
    % once the whole string has been mapped.
    \cs_new_protected:Npn \@@_convert_encode_:
      {
        \flag_clear:N \l_@@_error_flag
        \@@_convert_gmap_internal:N \@@_encode_native_char:n
        \@@_if_flag_error:Nne \l_@@_error_flag
          { native-overflow } { }
      }
    \cs_new:Npn \@@_encode_native_char:n #1
      {
        \if_int_compare:w #1 > \c_@@_max_byte_int
          \flag_raise:N \l_@@_error_flag
          ?
        \else:
          \char_generate:nn {#1} {12}
        \fi:
      }
    % The message is only needed, hence only defined, on these engines.
    \msg_new:nnnn { str } { native-overflow }
      { Character~code~too~large~for~this~engine. }
      {
        This~engine~only~supports~8-bit~characters:~
        valid~character~codes~are~in~the~range~[0,255].~
        To~manipulate~arbitrary~Unicode,~use~LuaTeX~or~XeTeX.
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{\texttt{clist}}
%
% \begin{macro}{\@@_convert_decode_clist:}
% \begin{macro}[rEXP]{\@@_decode_clist_char:n}
%   Convert each integer to the internal form.  We first turn
%   \cs{g_@@_result_tl} into a clist variable, as this avoids problems
%   with leading or trailing commas.
%    \begin{macrocode}
% Decoding of a comma list of integers into the internal form.
\cs_new_protected:Npn \@@_convert_decode_clist:
  {
    % Re-store the result as a comma list, so that leading or trailing
    % commas cause no problem in the mapping below.
    \clist_gset:No \g_@@_result_tl \g_@@_result_tl
    % Map over the items inside an expanding assignment: the result is
    % replaced by the concatenation of what the per-item function
    % leaves.
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_args:No \clist_map_function:nN
          \g_@@_result_tl \@@_decode_clist_char:n
      }
  }
% #1 is one item of the comma list: it is output as given, followed by
% \s_@@, its value as evaluated by \int_eval:n, and a second \s_@@.
\cs_new:Npn \@@_decode_clist_char:n #1
  { #1 \s_@@ \int_eval:n {#1} \s_@@ }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_encode_clist:}
% \begin{macro}[rEXP]{\@@_encode_clist_char:n}
%   Convert the internal list of character codes to a comma-list of
%   character codes.  The first line produces a comma-list with a
%   leading comma, removed in the next step (this also works in the
%   empty case, since \cs{tl_tail:N} does not trigger an error in this
%   case).
%    \begin{macrocode}
% Encoding of the internal form as a comma list of character codes.
\cs_new_protected:Npn \@@_convert_encode_clist:
  {
    % Every code point is output preceded by a comma ...
    \@@_convert_gmap_internal:N \@@_encode_clist_char:n
    % ... and the leading comma is then dropped by taking the tail.
    \__kernel_tl_gset:Nx \g_@@_result_tl { \tl_tail:N \g_@@_result_tl }
  }
% #1 is one character code.
\cs_new:Npn \@@_encode_clist_char:n #1 { , #1 }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{8-bit encodings}
%
% It is not clear in what situations 8-bit encodings are used, hence it
% is not clear what should be optimized.  The current approach is
% reasonably efficient to convert long strings, and it scales well when
% using many different encodings.
%
% The data needed to support a given 8-bit encoding is stored in a file
% that consists of a single function call
% \begin{quote}\ttfamily
%   \cs{@@_declare_eight_bit_encoding:nnnn} \Arg{name} \Arg{modulo}
%   \Arg{mapping} \Arg{missing}
% \end{quote}
% This declares the encoding \meta{name} to map bytes to Unicode
% characters according to the \meta{mapping}, and map those bytes which
% are not mentioned in the \meta{mapping} either to the replacement
% character (if they appear in \meta{missing}), or to themselves.  The
% \meta{mapping} argument is a token list of pairs \Arg{byte}
% \Arg{Unicode} expressed in uppercase hexadecimal notation.  The
% \meta{missing} argument is a token list of \Arg{byte}.  Every
% \meta{byte} which does not appear in the \meta{mapping} nor the
% \meta{missing} lists maps to itself in Unicode, so for instance the
% \texttt{latin1} encoding has empty \meta{mapping} and \meta{missing}
% lists.  The \meta{modulo} is a (decimal) integer between $256$ and
% $558$ inclusive, modulo which all Unicode code points supported by the
% encoding must be pairwise distinct.
%
% We use two integer arrays per encoding.  When decoding we only use the
% \texttt{decode} integer array, with entry $n+1$ (offset needed because
% integer array indices start at~$1$) equal to the Unicode code point
% that corresponds to the $n$-th byte in the encoding under
% consideration, or $-1$ if the given byte is invalid in this encoding.
% When encoding we use both arrays: upon seeing a code point~$n$, we
% look up the entry ($1$~plus) $n$ modulo some number $M$ in the
% \texttt{encode} array, which tells us the byte that might encode the
% given Unicode code point, then we check in the \texttt{decode} array
% that indeed this byte encodes the Unicode code point we want.  Here,
% $M$ is an encoding-dependent integer between $256$ and $558$ (it turns
% out), chosen so that among the Unicode code points that can be validly
% represented in the given encoding, no pair of code points have the
% same value modulo~$M$.
%
% \begin{macro}
%   {
%     \@@_declare_eight_bit_encoding:nnnn,
%     \@@_declare_eight_bit_aux:NNnnn,
%     \@@_declare_eight_bit_loop:Nnn,
%     \@@_declare_eight_bit_loop:Nn
%   }
%   Loop through both lists of bytes to fill in the \texttt{decode}
%   integer array, then fill the \texttt{encode} array accordingly.
%   For bytes that are invalid in the given encoding, store $-1$ in the
%   \texttt{decode} array.
%    \begin{macrocode}
% Only the first argument, the encoding name, is grabbed here.  The
% other three (modulo, mapping, missing) are left in the input and
% become the last three arguments of \@@_declare_eight_bit_aux:NNnnn,
% which ends the replacement text.
\cs_new_protected:Npn \@@_declare_eight_bit_encoding:nnnn #1
  {
    \tl_set:Nn \l_@@_internal_tl {#1}
    % Decoding and encoding functions for this encoding: both call the
    % generic 8-bit function with the encoding name.
    \cs_new_protected:cpn { @@_convert_decode_#1: }
      { \@@_convert_decode_eight_bit:n {#1} }
    \cs_new_protected:cpn { @@_convert_encode_#1: }
      { \@@_convert_encode_eight_bit:n {#1} }
    % Build the names of the two integer arrays and hand them over.
    \exp_args:Ncc \@@_declare_eight_bit_aux:NNnnn
      { g_@@_decode_#1_intarray } { g_@@_encode_#1_intarray }
  }
% #1 is the decode array, #2 the encode array, #3 the modulo (size of
% the encode array), #4 the mapping and #5 the list of missing bytes.
\cs_new_protected:Npn \@@_declare_eight_bit_aux:NNnnn #1#2#3#4#5
  {
    % Start from the identity: entry 1 + n holds n for n from 0 to 255.
    \intarray_new:Nn #1 { 256 }
    \int_step_inline:nnn { 0 } { 255 }
      { \intarray_gset:Nnn #1 { 1 + ##1 } {##1} }
    % Overwrite the entries listed in the mapping.  The two trailing
    % groups end the loop: the first one is grabbed as the byte, the
    % empty one as the code point.
    \@@_declare_eight_bit_loop:Nnn #1
      #4 { \s_@@_stop \prg_break: } { }
    \prg_break_point:
    % Mark the missing bytes with -1.
    \@@_declare_eight_bit_loop:Nn #1
      #5 { \s_@@_stop \prg_break: }
    \prg_break_point:
    % Fill the encode array: for every byte that is valid (decode entry
    % different from -1), store the byte at the position given by its
    % code point modulo the size of the encode array.
    \intarray_new:Nn #2 {#3}
    \int_step_inline:nnn { 0 } { 255 }
      {
        \int_compare:nNnF { \intarray_item:Nn #1 { 1 + ##1 } } = { -1 }
          {
            \intarray_gset:Nnn #2
              {
                1 +
                \int_mod:nn { \intarray_item:Nn #1 { 1 + ##1 } }
                  { \intarray_count:N #2 }
              }
              {##1}
          }
      }
  }
% Loop over the mapping: #1 is the decode array, #2 a byte and #3 the
% corresponding code point, both in hexadecimal.
\cs_new_protected:Npn \@@_declare_eight_bit_loop:Nnn #1#2#3
  {
    % NOTE(review): \@@_use_none_delimit_by_s_stop:w is defined outside
    % this part of the file; presumably it discards everything up to
    % \s_@@_stop.  For a normal item that removes the copy of #2; for
    % the end marker, which starts with \s_@@_stop, it leaves
    % \prg_break: and the loop ends.
    \@@_use_none_delimit_by_s_stop:w #2 \s_@@_stop
    \intarray_gset:Nnn #1 { 1 + "#2 } { "#3 }
    \@@_declare_eight_bit_loop:Nnn #1
  }
% Loop over the missing bytes: #1 is the decode array, #2 a byte in
% hexadecimal, whose entry is set to -1.  The loop ends in the same way
% as the one above.
\cs_new_protected:Npn \@@_declare_eight_bit_loop:Nn #1#2
  {
    \@@_use_none_delimit_by_s_stop:w #2 \s_@@_stop
    \intarray_gset:Nnn #1 { 1 + "#2 } { -1 }
    \@@_declare_eight_bit_loop:Nn #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_convert_decode_eight_bit:n}
% \begin{macro}[rEXP]{\@@_decode_eight_bit_aux:n, \@@_decode_eight_bit_aux:Nn}
%   The map from bytes to Unicode code points is in the \texttt{decode}
%   array corresponding to the given encoding.  Define \cs{@@_tmp:w} and
%   pass it successively all bytes in the string.  It produces an
%   internal representation with suitable \cs{s_@@} inserted, and the
%   corresponding code point is obtained by looking it up in the integer
%   array.  If the entry is $-1$ then issue a replacement character and
%   raise the flag indicating that there was an error.
%    \begin{macrocode}
% #1 is the name of the 8-bit encoding.
\cs_new_protected:Npn \@@_convert_decode_eight_bit:n #1
  {
    % \@@_tmp:w becomes the per-byte auxiliary followed by the decode
    % array of this encoding, so that it can be mapped over the bytes.
    \cs_set:Npe \@@_tmp:w
      {
        \exp_not:N \@@_decode_eight_bit_aux:Nn
        \exp_not:c { g_@@_decode_#1_intarray }
      }
    \flag_clear:N \l_@@_error_flag
    \@@_convert_gmap:N \@@_tmp:w
    % A single error is issued if any byte was invalid.
    \@@_if_flag_error:Nne \l_@@_error_flag { decode-8-bit } {#1}
  }
% #1 is the decode array, #2 one byte of the string.  The byte is
% output followed by \s_@@, the code point found at position 1 + `#2 of
% the array (after treatment by the next auxiliary), and \s_@@.
\cs_new:Npn \@@_decode_eight_bit_aux:Nn #1#2
  {
    #2 \s_@@
    \exp_args:Nf \@@_decode_eight_bit_aux:n
      { \intarray_item:Nn #1 { 1 + `#2 } }
    \s_@@
  }
% #1 is the array entry: a negative value marks an invalid byte, for
% which the flag is raised and the replacement character is used.
\cs_new:Npn \@@_decode_eight_bit_aux:n #1
  {
    \if_int_compare:w #1 < \c_zero_int
      \flag_raise:N \l_@@_error_flag
      \int_value:w \c_@@_replacement_char_int
    \else:
      #1
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_encode_eight_bit:n}
% \begin{macro}[rEXP]{\@@_encode_eight_bit_aux:nnN, \@@_encode_eight_bit_aux:NNn}
%   It is not practical to make an integer array with indices in the
%   full Unicode range, so we work modulo some number, which is simply
%   the size of the \texttt{encode} integer array for the given
%   encoding.  This gives us a candidate byte for representing a given
%   Unicode code point.  Of course taking the modulo leads to collisions
%   so we check in the \texttt{decode} array that the byte we got is
%   indeed correct.  Otherwise the Unicode code point we started from is
%   simply not representable in the given encoding.
%    \begin{macrocode}
% NOTE(review): \l_@@_modulo_int is declared here but not used by any
% of the code visible in this part of the file.
\int_new:N \l_@@_modulo_int
% #1 is the name of the 8-bit encoding.
\cs_new_protected:Npn \@@_convert_encode_eight_bit:n #1
  {
    % \@@_tmp:w becomes the per-character auxiliary followed by the
    % encode and decode arrays of this encoding.
    \cs_set:Npe \@@_tmp:w
      {
        \exp_not:N \@@_encode_eight_bit_aux:NNn
        \exp_not:c { g_@@_encode_#1_intarray }
        \exp_not:c { g_@@_decode_#1_intarray }
      }
    \flag_clear:N \l_@@_error_flag
    \@@_convert_gmap_internal:N \@@_tmp:w
    % A single error is issued if any character was not representable.
    \@@_if_flag_error:Nne \l_@@_error_flag { encode-8-bit } {#1}
  }
% #1 is the encode array, #2 the decode array, #3 a code point.  The
% candidate byte is the entry of the encode array at the position given
% by the code point modulo the size of that array.
\cs_new:Npn \@@_encode_eight_bit_aux:NNn #1#2#3
  {
    \exp_args:Nf \@@_encode_eight_bit_aux:nnN
      {
        \intarray_item:Nn #1
          { 1 + \int_mod:nn {#3} { \intarray_count:N #1 } }
      }
      {#3}
      #2
  }
% #1 is the candidate byte, #2 the code point, #3 the decode array.
% The byte is output only if it decodes back to the code point;
% otherwise nothing is output and the flag is raised.
\cs_new:Npn \@@_encode_eight_bit_aux:nnN #1#2#3
  {
    \int_compare:nNnTF { \intarray_item:Nn #3 { 1 + #1 } } = {#2}
      { \@@_output_byte:n {#1} }
      { \flag_raise:N \l_@@_error_flag }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsection{Messages}
%
% General messages, and messages for the encodings and escapings loaded
% by default (\enquote{native}, and \enquote{bytes}).
%    \begin{macrocode}
% In the two "unknown" messages, raised by \@@_convert:nnnn, #1 is the
% name as given by the user and #2 the filtered name.
\msg_new:nnn { str } { unknown-esc }
  { Escaping~scheme~'#1'~(filtered:~'#2')~unknown. }
\msg_new:nnn { str } { unknown-enc }
  { Encoding~scheme~'#1'~(filtered:~'#2')~unknown. }
% #1 is the escaping that was requested together with the native
% encoding.
\msg_new:nnnn { str } { native-escaping }
  { The~'native'~encoding~scheme~does~not~support~any~escaping. }
  {
    Since~native~strings~do~not~consist~in~bytes,~
    none~of~the~escaping~methods~make~sense.~
    The~specified~escaping,~'#1',~will~be~ignored.
  }
% #1 is the part of the file name between the l3str- prefix and the
% .def extension.  NOTE(review): no use of this message is visible in
% this part of the file.
\msg_new:nnn { str } { file-not-found }
  { File~'l3str-#1.def'~not~found. }
%    \end{macrocode}
%
% Message used when the \enquote{bytes} unescaping fails because the
% string given to \cs{str_set_convert:Nnnn} contains a non-byte. This
% cannot happen for 8-bit engines, hence the message is only defined
% for \LuaTeX{} and \XeTeX{}.  Messages used for other escapings and
% encodings are defined in each definition file.
%    \begin{macrocode}
% The message is defined on LuaTeX and XeTeX only.  #1 is the name of
% the escaping for which a non-byte was found.
\bool_lazy_or:nnT
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  {
    \msg_new:nnnn { str } { non-byte }
      { String~invalid~in~escaping~'#1':~it~may~only~contain~bytes. }
      {
        Some~characters~in~the~string~you~asked~to~convert~are~not~
        8-bit~characters.~Perhaps~the~string~is~a~'native'~Unicode~string?~
        If~it~is,~try~using\\
        \\
        \iow_indent:n
          {
            \iow_char:N\\str_set_convert:Nnnn \\
            \ \ <str~var>~\{~<string>~\}~\{~native~\}~\{~<target~encoding>~\}
          }
      }
  }
%    \end{macrocode}
%
% Those messages are used when converting to and from 8-bit encodings.
%    \begin{macrocode}
% In both messages #1 is the name of the 8-bit encoding.
% Raised by \@@_convert_decode_eight_bit:n when a byte is marked as
% invalid in the decode array.
\msg_new:nnnn { str } { decode-8-bit }
  { Invalid~string~in~encoding~'#1'. }
  {
    LaTeX~came~across~a~byte~which~is~not~defined~to~represent~
    any~character~in~the~encoding~'#1'.
  }
% Raised by \@@_convert_encode_eight_bit:n when a code point has no
% byte in the encoding.
\msg_new:nnnn { str } { encode-8-bit }
  { Unicode~string~cannot~be~converted~to~encoding~'#1'. }
  {
    The~encoding~'#1'~only~contains~a~subset~of~all~Unicode~characters.~
    LaTeX~was~asked~to~convert~a~string~to~that~encoding,~but~that~
    string~contains~a~character~that~'#1'~does~not~support.
  }
%    \end{macrocode}
%
% \subsection{Escaping definitions}
%
% Several of those escapings are defined by the \textsc{pdf} file
% format.  The following byte storage methods are defined:
% \begin{itemize}
%   \item \texttt{bytes} (default), non-bytes are filtered out, and
%     bytes are left untouched (this is defined by default);
%   \item \texttt{hex} or \texttt{hexadecimal}, as per the \pdfTeX{}
%     primitive \tn{pdfescapehex}
%   \item \texttt{name}, as per the \pdfTeX{} primitive
%     \tn{pdfescapename}
%   \item \texttt{string}, as per the \pdfTeX{} primitive
%     \tn{pdfescapestring}
%   \item \texttt{url}, as per the percent encoding of urls.
% \end{itemize}
%
% \subsubsection{Unescape methods}
%
% \begin{macro}{\@@_convert_unescape_hex:}
% \begin{macro}[rEXP]{\@@_unescape_hex_auxi:N}
% \begin{macro}[rEXP]{\@@_unescape_hex_auxii:N}
%   Take chars two by two, and interpret each pair as the hexadecimal
%   code for a byte. Anything else than hexadecimal digits is ignored,
%   raising the flag.  A string which contains an odd number of
%   hexadecimal digits gets |0| appended to it: this is equivalent to
%   appending a |0| in all cases, and dropping it if it is alone.
%    \begin{macrocode}
% Unescaping of hexadecimal strings: the two auxiliaries alternate,
% auxi reading the first digit of each pair and auxii the second.
\cs_new_protected:Npn \@@_convert_unescape_hex:
  {
    \group_begin:
      \flag_clear:N \l_@@_error_flag
      % The escape character is set to the backslash (92) within the
      % group.
      \int_set:Nn \tex_escapechar:D { 92 }
      \__kernel_tl_gset:Nx \g_@@_result_tl
        {
          % Start collecting the first byte, in hexadecimal.
          \@@_output_byte:w "
            \exp_last_unbraced:Nf \@@_unescape_hex_auxi:N
              { \tl_to_str:N \g_@@_result_tl }
            % The extra 0 completes a pair when the number of digits is
            % odd.  The end marker leaves 0 - 1 behind, which makes the
            % byte being collected at that point negative.
            % NOTE(review): presumably \@@_output_byte:w outputs
            % nothing for a negative value; it is defined elsewhere.
            0 { ? 0 - 1 \prg_break: }
            \prg_break_point:
          \@@_output_end:
        }
      \@@_if_flag_error:Nne \l_@@_error_flag { unescape-hex } { }
    \group_end:
  }
% First digit of a pair.  #1 is a character or the end marker.
\cs_new:Npn \@@_unescape_hex_auxi:N #1
  {
    % Ends the loop when #1 is the end marker, no-op otherwise.
    \use_none:n #1
    \@@_hexadecimal_use:NTF #1
      { \@@_unescape_hex_auxii:N }
      {
        % Not a hexadecimal digit: raise the flag and try the next
        % character as first digit.
        \flag_raise:N \l_@@_error_flag
        \@@_unescape_hex_auxi:N
      }
  }
% Second digit of a pair.  #1 is a character or the end marker.
\cs_new:Npn \@@_unescape_hex_auxii:N #1
  {
    \use_none:n #1
    \@@_hexadecimal_use:NTF #1
      {
        % The pair is complete: close this byte and open the next one.
        \@@_output_end:
        \@@_output_byte:w " \@@_unescape_hex_auxi:N
      }
      {
        % Not a hexadecimal digit: raise the flag and try the next
        % character as second digit.
        \flag_raise:N \l_@@_error_flag
        \@@_unescape_hex_auxii:N
      }
  }
\msg_new:nnnn { str } { unescape-hex }
  { String~invalid~in~escaping~'hex':~only~hexadecimal~digits~allowed. }
  {
    Some~characters~in~the~string~you~asked~to~convert~are~not~
    hexadecimal~digits~(0-9,~A-F,~a-f)~nor~spaces.
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_unescape_name:}
% \begin{macro}[rEXP]{\@@_unescape_name_loop:wNN}
% \begin{macro}{\@@_convert_unescape_url:}
% \begin{macro}[rEXP]{\@@_unescape_url_loop:wNN}
%   The \cs{@@_convert_unescape_name:} function replaces each
%   occurrence of |#| followed by two hexadecimal digits in
%   \cs{g_@@_result_tl} by the corresponding byte.  The \texttt{url}
%   function is identical, with escape character |%| instead of |#|.
%   Thus we define the two together. The arguments of \cs{@@_tmp:w} are
%   the escape character |#| or |%| itself (taken from \cs{c_hash_str}
%   or \cs{c_percent_str}), the name of the escaping (from which the
%   names of the main function and of the error message are built), and
%   the name of the auxiliary which performs the loop.
%
%   The looping auxiliary |#3| finds the next escape character, reads
%   the following two characters, and tests them. The test
%   \cs{@@_hexadecimal_use:NTF} leaves the upper-case digit in the
%   input stream, hence we surround the test with
%   \cs{@@_output_byte:w}~|"| and \cs{@@_output_end:}.  If both
%   characters are hexadecimal digits, they should be removed before
%   looping: this is done by \cs{use_i:nnn}.  If one of the characters
%   is not a hexadecimal digit, then feed the character code of the
%   escape character (|`#1|) to \cs{@@_output_byte:w} to produce the
%   escape character, raise the flag, and call the looping function
%   followed by the two characters (remove \cs{use_i:nnn}).
%    \begin{macrocode}
% #1 is the escape character, #2 the name of the escaping and #3 the
% looping auxiliary to define.
\cs_set_protected:Npn \@@_tmp:w #1#2#3
  {
    \cs_new_protected:cpn { @@_convert_unescape_#2: }
      {
        \group_begin:
          \flag_clear:N \l_@@_byte_flag
          \flag_clear:N \l_@@_error_flag
          \int_set:Nn \tex_escapechar:D { 92 }
          \__kernel_tl_gset:Nx \g_@@_result_tl
            {
              % A final escape character, a filler ? and the end marker
              % are appended: in the last iteration the loop grabs ? as
              % its second argument and the marker as its third.
              \exp_after:wN #3 \g_@@_result_tl
                #1 ? { ? \prg_break: }
              \prg_break_point:
            }
          \@@_if_flag_error:Nne \l_@@_byte_flag { non-byte } { #2 }
          \@@_if_flag_error:Nne \l_@@_error_flag { unescape-#2 } { }
        \group_end:
      }
    % ##1 is the text before the next escape character, ##2 and ##3 are
    % the two tokens following that escape character.
    \cs_new:Npn #3 ##1#1##2##3
      {
        \@@_filter_bytes:n {##1}
        % Ends the loop when ##3 is the end marker, no-op otherwise.
        \use_none:n ##3
        \@@_output_byte:w "
          \@@_hexadecimal_use:NTF ##2
            {
              \@@_hexadecimal_use:NTF ##3
                { }
                {
                  % The first digit is already in the expression:
                  % cancel it with * 0 and use the code of the escape
                  % character.  \use_i:nn keeps \@@_output_end: and
                  % drops \use_i:nnn below, so that ##2##3 remain in
                  % the input for the next iteration.
                  \flag_raise:N \l_@@_error_flag
                  * 0 + `#1 \use_i:nn
                }
            }
            {
              % Same as above, with no digit to cancel.
              \flag_raise:N \l_@@_error_flag
              0 + `#1 \use_i:nn
            }
        \@@_output_end:
        % Valid pair: drop ##2##3 and loop.
        \use_i:nnn #3 ##2##3
      }
    \msg_new:nnnn { str } { unescape-#2 }
      { String~invalid~in~escaping~'#2'. }
      {
        LaTeX~came~across~the~escape~character~'#1'~not~followed~by~
        two~hexadecimal~digits.~This~is~invalid~in~the~escaping~'#2'.
      }
  }
% The escape characters are obtained by expanding the string constants
% once before \@@_tmp:w grabs its arguments.
\exp_after:wN \@@_tmp:w \c_hash_str { name }
  \@@_unescape_name_loop:wNN
\exp_after:wN \@@_tmp:w \c_percent_str { url }
  \@@_unescape_url_loop:wNN
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_unescape_string:}
% \begin{macro}[rEXP]{\@@_unescape_string_newlines:wN}
% \begin{macro}[rEXP]{\@@_unescape_string_loop:wNNN}
% \begin{macro}[rEXP]{\@@_unescape_string_repeat:NNNNNN}
%   The \texttt{string} escaping is somewhat similar to the
%   \texttt{name} and \texttt{url} escapings, with escape character |\|.
%   The first step is to convert all three line endings, |^^J|, |^^M|,
%   and |^^M^^J| to the common |^^J|, as per the \textsc{pdf}
%   specification.  This step cannot raise the flag.
%
%   Then the following escape sequences are decoded.
%   \begin{itemize}\def\makelabel#1{\hss\llap{\ttfamily\string#1}}
%     \item[\n] Line feed ($10$)
%     \item[\r] Carriage return ($13$)
%     \item[\t] Horizontal tab ($9$)
%     \item[\b] Backspace ($8$)
%     \item[\f] Form feed ($12$)
%     \item[\(] Left parenthesis
%     \item[\)] Right parenthesis
%     \item[\\] Backslash
%     \item[\ddd] (backslash followed by $1$ to $3$ octal digits) Byte
%       \texttt{ddd} (octal), subtracting $256$ in case of overflow.
%   \end{itemize}
%   If followed by an end-of-line character, the backslash and the
%   end-of-line are ignored. If followed by anything else, the backslash
%   is ignored, raising the error flag.
%    \begin{macrocode}
\group_begin:
  % Within this group ^^J and ^^M have category code other, so that
  % they can be used as delimiters and compared with characters of the
  % string.
  \char_set_catcode_other:N \^^J
  \char_set_catcode_other:N \^^M
  % #1 is the backslash character, obtained by expanding
  % \c_backslash_str in the call below.
  \cs_set_protected:Npn \@@_tmp:w #1
    {
      \cs_new_protected:Npn \@@_convert_unescape_string:
        {
          \group_begin:
            \flag_clear:N \l_@@_byte_flag
            \flag_clear:N \l_@@_error_flag
            \int_set:Nn \tex_escapechar:D { 92 }
            % First pass: normalise the line endings.  The appended
            % \prg_break: ends up in the text before the final ^^M and
            % stops the loop once the remaining text has been output.
            \__kernel_tl_gset:Nx \g_@@_result_tl
              {
                \exp_after:wN \@@_unescape_string_newlines:wN
                  \g_@@_result_tl \prg_break: ^^M ?
                \prg_break_point:
              }
            % Second pass: decode the escape sequences.  A final
            % backslash, two fillers and the end marker are appended.
            \__kernel_tl_gset:Nx \g_@@_result_tl
              {
                \exp_after:wN \@@_unescape_string_loop:wNNN
                  \g_@@_result_tl #1 ?? { ? \prg_break: }
                \prg_break_point:
              }
            \@@_if_flag_error:Nne \l_@@_byte_flag { non-byte } { string }
            \@@_if_flag_error:Nne \l_@@_error_flag { unescape-string } { }
          \group_end:
        }
    }
  \exp_args:No \@@_tmp:w { \c_backslash_str }
  % The loop is delimited by the backslash character: \c_backslash_str
  % is expanded once to build the parameter text.  #1 is the text
  % before the next backslash, #2#3#4 the three tokens after it.
  \exp_last_unbraced:NNNNo
    \cs_new:Npn \@@_unescape_string_loop:wNNN #1 \c_backslash_str #2#3#4
        {
          \@@_filter_bytes:n {#1}
          % Ends the loop when #4 is the end marker, no-op otherwise.
          \use_none:n #4
          % The byte is collected in octal.
          \@@_output_byte:w '
            \@@_octal_use:NTF #2
              {
                \@@_octal_use:NTF #3
                  {
                    \@@_octal_use:NTF #4
                      {
                        % Three octal digits: a first digit above 3
                        % gives a value of 256 or more, reduced here.
                        \if_int_compare:w #2 > 3 \exp_stop_f:
                          - 256
                        \fi:
                        \@@_unescape_string_repeat:NNNNNN
                      }
                      % Two digits: the filler ? makes the auxiliary
                      % leave #4 in the input.
                      { \@@_unescape_string_repeat:NNNNNN ? }
                  }
                  % One digit: two fillers leave #3#4 in the input.
                  { \@@_unescape_string_repeat:NNNNNN ?? }
              }
              {
                % Not an octal digit: single-character escapes, with
                % the resulting byte given in octal.
                \str_case_e:nnF {#2}
                  {
                    { \c_backslash_str } { 134 }
                    { ( } { 50 }
                    { ) } { 51 }
                    { r } { 15 }
                    { f } { 14 }
                    { n } { 12 }
                    { t } { 11 }
                    { b } { 10 }
                    % Backslash followed by an end of line: a negative
                    % value.  NOTE(review): presumably no byte is output
                    % for it; \@@_output_byte:w is defined elsewhere.
                    { ^^J } { 0 - 1 }
                  }
                  {
                    % Unknown escape: raise the flag; \use_i:nn drops
                    % the \use_i:nn below so that #2 stays in the input.
                    \flag_raise:N \l_@@_error_flag
                    0 - 1 \use_i:nn
                  }
              }
          \@@_output_end:
          % Known single-character escape: drop #2 and loop with #3#4
          % still in the input.
          \use_i:nn \@@_unescape_string_loop:wNNN #2#3#4
        }
  % Removes the six tokens that follow (fillers, if any, then the end
  % of the loop body above), closes the byte and restarts the loop.
  \cs_new:Npn \@@_unescape_string_repeat:NNNNNN #1#2#3#4#5#6
    { \@@_output_end: \@@_unescape_string_loop:wNNN }
  % #1 is the text before the next ^^M, #2 the token after it.
  \cs_new:Npn \@@_unescape_string_newlines:wN #1 ^^M #2
    {
      #1
      % Output ^^J in place of ^^M, unless ^^M is followed by ^^J: then
      % the ^^J put back below is output in the next iteration.
      \if_charcode:w ^^J #2 \else: ^^J \fi:
      \@@_unescape_string_newlines:wN #2
    }
  \msg_new:nnnn { str } { unescape-string }
    { String~invalid~in~escaping~'string'. }
    {
      LaTeX~came~across~an~escape~character~'\c_backslash_str'~
      not~followed~by~any~of:~'n',~'r',~'t',~'b',~'f',~'(',~')',~
      '\c_backslash_str',~one~to~three~octal~digits,~or~the~end~
      of~a~line.
    }
\group_end:
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{Escape methods}
%
% Currently, none of the escape methods can lead to errors, assuming
% that their input is made out of bytes.
%
% \begin{macro}{\@@_convert_escape_hex:}
% \begin{macro}[rEXP]{\@@_escape_hex_char:N}
%   Loop and convert each byte to hexadecimal.
%    \begin{macrocode}
% Hexadecimal escaping: map the per-byte function over the result.
\cs_new_protected:Nn \@@_convert_escape_hex:
  { \@@_convert_gmap:N \@@_escape_hex_char:N }
% #1 is one byte, whose character code is output in hexadecimal.
\cs_new:Nn \@@_escape_hex_char:N
  { \@@_output_hexadecimal:n { `#1 } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_name:}
% \begin{macro}[rEXP]{\@@_escape_name_char:n}
% \begin{macro}[rEXP]{\@@_if_escape_name:nTF}
% \begin{variable}{\c_@@_escape_name_str}
% \begin{variable}{\c_@@_escape_name_not_str}
%   For each byte, test whether it should be output as is, or be
%   \enquote{hash-encoded}.  Roughly, bytes outside the range
%   $[\hexnum{2A},\hexnum{7E}]$ are hash-encoded. We keep two lists of
%   exceptions: characters in \cs{c_@@_escape_name_not_str} are not
%   hash-encoded, and characters in the \cs{c_@@_escape_name_str} are
%   encoded.
%    \begin{macrocode}
% Exceptions to the range test: the first list holds characters below
% "2A which are output as is, the second one characters within the
% range which are hash-encoded nevertheless.
\str_const:Nn \c_@@_escape_name_not_str { ! " $ & ' } %$
\str_const:Nn \c_@@_escape_name_str { {}/<>[] }
\cs_new_protected:Npn \@@_convert_escape_name:
  { \@@_convert_gmap:N \@@_escape_name_char:n }
% #1 is one byte.  The true branch of the conditional outputs the byte
% unchanged, the false branch outputs a hash followed by the character
% code in hexadecimal.
\cs_new:Npn \@@_escape_name_char:n #1
  {
    \@@_if_escape_name:nTF {#1}
      {#1}
      { \c_hash_str \@@_output_hexadecimal:n {`#1} }
  }
% True when the byte #1 can be output as is.
\prg_new_conditional:Npnn \@@_if_escape_name:n #1 { TF }
  {
    \if_int_compare:w `#1 > "7E \exp_stop_f:
      % Above the range: always encoded.
      \prg_return_false:
    \else:
      \if_int_compare:w `#1 < "2A \exp_stop_f:
        % Below the range: encoded unless listed as an exception.
        \@@_if_contains_char:NnTF \c_@@_escape_name_not_str {#1}
          \prg_return_true: \prg_return_false:
      \else:
        % Within the range: kept unless listed as an exception.
        \@@_if_contains_char:NnTF \c_@@_escape_name_str {#1}
          \prg_return_false: \prg_return_true:
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{variable}
% \end{variable}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_string:}
% \begin{macro}[rEXP]{\@@_escape_string_char:N}
% \begin{macro}[rEXP]{\@@_if_escape_string:NTF}
% \begin{variable}{\c_@@_escape_string_str}
%   Any character below |'| (character code \hexnum{27}), and any
%   character above |z| (character code \hexnum{7A}), are converted to
%   octal; this includes the control characters, space and
%   \texttt{del}.  One backslash is added before each parenthesis and
%   backslash.
%    \begin{macrocode}
% Characters which are output preceded by a backslash.
\str_const:Ne \c_@@_escape_string_str
  { \c_backslash_str ( ) }
\cs_new_protected:Npn \@@_convert_escape_string:
  { \@@_convert_gmap:N \@@_escape_string_char:N }
% #1 is one byte.  The true branch of the conditional outputs the byte
% itself, with a backslash in front if it is one of the characters
% above; the false branch outputs a backslash and three octal digits.
\cs_new:Npn \@@_escape_string_char:N #1
  {
    \@@_if_escape_string:NTF #1
      {
        \@@_if_contains_char:NnT \c_@@_escape_string_str {#1}
          { \c_backslash_str }
        #1
      }
      {
        \c_backslash_str
        % The three octal digits, most significant first.
        \int_div_truncate:nn { `#1 } { 64 }
        \int_mod:nn { \int_div_truncate:nn { `#1 } { 8 } } { 8 }
        \int_mod:nn { `#1 } { 8 }
      }
  }
% True when the character code of #1 lies in the range from "27 to "7A
% inclusive.
\prg_new_conditional:Npnn \@@_if_escape_string:N #1 { TF }
  {
    \if_int_compare:w `#1 > "26 \exp_stop_f:
      \if_int_compare:w `#1 < "7B \exp_stop_f:
        \prg_return_true:
      \else:
        \prg_return_false:
      \fi:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{variable}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_url:}
% \begin{macro}[rEXP]{\@@_escape_url_char:n}
% \begin{macro}[rEXP]{\@@_if_escape_url:nTF}
%   This function is similar to \cs{@@_convert_escape_name:}, escaping
%   different characters.
%    \begin{macrocode}
% Percent-encoding: map the per-byte function over the result.
\cs_new_protected:Npn \@@_convert_escape_url:
  { \@@_convert_gmap:N \@@_escape_url_char:n }
% #1 is one byte.  The true branch of the conditional outputs the byte
% unchanged, the false branch outputs a percent character followed by
% the character code in hexadecimal.
\cs_new:Npn \@@_escape_url_char:n #1
  {
    \@@_if_escape_url:nTF {#1}
      {#1}
      { \c_percent_str \@@_output_hexadecimal:n { `#1 } }
  }
% True when the byte #1 can be output as is.
\prg_new_conditional:Npnn \@@_if_escape_url:n #1 { TF }
  {
    \if_int_compare:w `#1 > "7E \exp_stop_f:
      % Above the range: always encoded.
      \prg_return_false:
    \else:
      \if_int_compare:w `#1 < "30 \exp_stop_f:
        % Below the range: encoded unless listed here.
        \@@_if_contains_char:nnTF { "-. } {#1}
          \prg_return_true: \prg_return_false:
      \else:
        % Within the range: kept unless listed here.
        \@@_if_contains_char:nnTF { : ; = ? @ [ ] } {#1}
          \prg_return_false: \prg_return_true:
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Encoding definitions}
%
% The \texttt{native} encoding is automatically defined. Other encodings
% are loaded as needed. The following encodings are supported:
% \begin{itemize}
%   \item \textsc{utf-8};
%   \item \textsc{utf-16}, big-, little-endian, or with byte order mark;
%   \item \textsc{utf-32}, big-, little-endian, or with byte order mark;
%   \item the \textsc{iso 8859} code pages, numbered from $1$ to $16$,
%     skipping the nonexistent \textsc{iso 8859-12}.
% \end{itemize}
%
% \subsubsection{\textsc{utf-8} support}
%
% \begin{macro}{\@@_convert_encode_utf8:}
% \begin{macro}[rEXP]{\@@_encode_utf_viii_char:n}
% \begin{macro}[rEXP]{\@@_encode_utf_viii_loop:wwnnw}
%   Loop through the internal string, and convert each character to its
%   \textsc{utf-8} representation. The representation is built from the
%   right-most (least significant) byte to the left-most (most
%   significant) byte. Continuation bytes are in the range $[128,191]$,
%   taking $64$ different values, hence we roughly want to express the
%   character code in base $64$, shifting the first digit in the
%   representation by some number depending on how many continuation
%   bytes there are. In the range $[0,127]$, output the corresponding
%   byte directly. In the range $[128,2047]$, output the remainder
%   modulo $64$, plus $128$ as a continuation byte, then output the
%   quotient (which is in the range $[0,31]$), shifted by $192$. In the
%   next range, $[2048,65535]$, split the character code into residue
%   and quotient modulo $64$, output the residue as a first continuation
%   byte, then repeat; this leaves us with a quotient in the range
%   $[0,15]$, which we output shifted by $224$. The last range,
%   $[65536,1114111]$, follows the same pattern: once we realize that
%   dividing twice by $64$ leaves us with a number larger than $15$, we
%   repeat, producing a last continuation byte, and offset the quotient
%   by $240$ for the leading byte.
%
%   How is that implemented? \cs{@@_encode_utf_viii_loop:wwnnw} takes
%   successive quotients as its first argument, the quotient from the
%   previous step as its second argument (except in step~$1$), the bound
%   for quotients that trigger one more step or not, and finally the
%   offset used if this step should produce the leading byte. Leading
%   bytes can be in the ranges $[0,127]$, $[192,223]$, $[224,239]$, and
%   $[240,247]$ (really, that last limit should be $244$ because Unicode
%   stops at the code point $1114111$). At each step, if the quotient
%   |#1| is less than the limit |#3| for that range, output the leading
%   byte (|#1| shifted by |#4|) and stop. Otherwise, we need one more
%   step: use the quotient of |#1| by $64$, and |#1| as arguments for
%   the looping auxiliary, and output the continuation byte
%   corresponding to the remainder $|#2|-64|#1|+128$. The bizarre
%   construction |- 1 + 0 *| removes the spurious initial
%   continuation byte (better methods welcome).
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_encode_utf8: }
  % Entry point for encoding to UTF-8: passes the per-character encoder
  % to the mapping helper (defined elsewhere; presumably it applies the
  % function to every code point of the internal string).
  { \@@_convert_gmap_internal:N \@@_encode_utf_viii_char:n }
\cs_new:Npn \@@_encode_utf_viii_char:n #1
  {
    % Encode the code point #1.  The loop is seeded with #1 as current
    % quotient and the dummy "- 1 + 0 *" as previous quotient, so that
    % the byte expression trailing the first loop step evaluates to -1
    % instead of a real (spurious) continuation byte.
    % Each pair below is {limit} {offset}: a quotient below the limit is
    % output as a leading byte after adding the offset.
    \@@_encode_utf_viii_loop:wwnnw #1 ; - 1 + 0 * ;
      { 128 } {       0 }  % single byte
      {  32 } {     192 }  % leading byte of a 2-byte sequence
      {  16 } {     224 }  % leading byte of a 3-byte sequence
      {   8 } {     240 }  % leading byte of a 4-byte sequence
    \s_@@_stop
  }
\cs_new:Npn \@@_encode_utf_viii_loop:wwnnw #1; #2; #3#4 #5 \s_@@_stop
  {
    % #1: current quotient, #2: quotient of the previous step,
    % #3: limit for this step, #4: leading-byte offset for this step,
    % #5: remaining {limit} {offset} pairs.
    \if_int_compare:w #1 < #3 \exp_stop_f:
      % Small enough to be the leading byte: output it, then drop the
      % recursive call below (everything up to \s_@@_stop).
      \@@_output_byte:n { #1 + #4 }
      \exp_after:wN \@@_use_none_delimit_by_s_stop:w
    \fi:
    % Otherwise recurse on the quotient by 64, remembering #1.
    \exp_after:wN \@@_encode_utf_viii_loop:wwnnw
      \int_value:w \int_div_truncate:nn {#1} {64} ; #1 ;
      #5 \s_@@_stop
    % Reached once the more significant bytes have been produced:
    % continuation byte #2 - 64 * #1 + 128.
    \@@_output_byte:n { #2 - 64 * ( #1 - 2 ) }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{variable}
%   {
%     @@_missing  ,
%     @@_extra    ,
%     @@_overlong ,
%     @@_overflow ,
%   }
%   When decoding a string that is purportedly in the \textsc{utf-8}
%   encoding, four different errors can occur, signalled by a specific
%   flag for each (we define those flags using \cs{flag_clear_new:N}
%   rather than \cs{flag_new:N}, because they are shared with other
%   encoding definition files).
%   \begin{itemize}
%     \item \enquote{Missing continuation byte}: a leading byte is not
%       followed by the right number of continuation bytes.
%     \item \enquote{Extra continuation byte}: a continuation byte
%       appears where it was not expected, \emph{i.e.}, not after an
%       appropriate leading byte.
%     \item \enquote{Overlong}: a Unicode character is expressed using
%       more bytes than necessary, for instance, \hexnum{C0}\hexnum{80}
%       for the code point $0$, instead of a single null byte.
%     \item \enquote{Overflow}: this occurs when decoding produces
%       Unicode code points greater than $1114111$.
%   \end{itemize}
%   We only raise one \LaTeX3 error message, combining all the errors
%   which occurred. In the short message, the leading comma must be
%   removed to get a grammatically correct sentence. In the long text,
%   first remind the user what a correct \textsc{utf-8} string should
%   look like, then add error-specific information.
%    \begin{macrocode}
\flag_clear_new:N \l_@@_missing_flag   % leading byte lacks continuation bytes
\flag_clear_new:N \l_@@_extra_flag     % continuation byte where none was expected
\flag_clear_new:N \l_@@_overlong_flag  % more bytes used than necessary
\flag_clear_new:N \l_@@_overflow_flag  % code point greater than 1114111
\msg_new:nnnn { str } { utf8-decode }
  {
    % Short text.  Each item starts with a comma; the list is f-expanded
    % and \use_none:n then removes the comma of the first item.
    % (\@@_if_flag_times:NT is defined elsewhere; presumably it inserts
    % its text only when the flag is raised.)
    Invalid~UTF-8~string:
    \exp_last_unbraced:Nf \use_none:n
      {
        \@@_if_flag_times:NT \l_@@_missing_flag  { ,~missing~continuation~byte }
        \@@_if_flag_times:NT \l_@@_extra_flag    { ,~extra~continuation~byte }
        \@@_if_flag_times:NT \l_@@_overlong_flag { ,~overlong~form }
        \@@_if_flag_times:NT \l_@@_overflow_flag { ,~code~point~too~large }
      }
    .
  }
  {
    % Long text: general reminder of the UTF-8 bit patterns first, ...
    In~the~UTF-8~encoding,~each~Unicode~character~consists~in~
    1~to~4~bytes,~with~the~following~bit~pattern: \\
    \iow_indent:n
      {
        Code~point~\ \ \ \ <~128:~0xxxxxxx \\
        Code~point~\ \ \  <~2048:~110xxxxx~10xxxxxx \\
        Code~point~\ \   <~65536:~1110xxxx~10xxxxxx~10xxxxxx \\
        Code~point~    <~1114112:~11110xxx~10xxxxxx~10xxxxxx~10xxxxxx \\
      }
    Bytes~of~the~form~10xxxxxx~are~called~continuation~bytes.
    % ... then one extra paragraph per flag that is raised.
    \flag_if_raised:NT \l_@@_missing_flag
      {
        \\\\
        A~leading~byte~(in~the~range~[192,255])~was~not~followed~by~
        the~appropriate~number~of~continuation~bytes.
      }
    \flag_if_raised:NT \l_@@_extra_flag
      {
        \\\\
        LaTeX~came~across~a~continuation~byte~when~it~was~not~expected.
      }
    \flag_if_raised:NT \l_@@_overlong_flag
      {
        \\\\
        Every~Unicode~code~point~must~be~expressed~in~the~shortest~
        possible~form.~For~instance,~'0xC0'~'0x83'~is~not~a~valid~
        representation~for~the~code~point~3.
      }
    \flag_if_raised:NT \l_@@_overflow_flag
      {
        \\\\
        Unicode~limits~code~points~to~the~range~[0,1114111].
      }
  }
\prop_gput:Nnn \g_msg_module_name_prop { str } { LaTeX }  % name stored for module "str" (presumably shown in its messages)
\prop_gput:Nnn \g_msg_module_type_prop { str } { }        % empty type stored for module "str"
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_convert_decode_utf8:}
% \begin{macro}[rEXP]
%   {
%     \@@_decode_utf_viii_start:N,
%     \@@_decode_utf_viii_continuation:wwN,
%     \@@_decode_utf_viii_aux:wNnnwN
%   }
% \begin{macro}[rEXP]
%   {\@@_decode_utf_viii_overflow:w, \@@_decode_utf_viii_end:}
%   Decoding is significantly harder than encoding. As before, lower
%   some flags, which are tested at the end (in bulk, to trigger at most
%   one \LaTeX3 error, as explained above). We expect successive
%   multi-byte sequences of the form \meta{start byte}
%   \meta{continuation bytes}. The \texttt{_start} auxiliary tests the
%   first byte:
%   \begin{itemize}
%     \item $[0,\hexnum{7F}]$: the byte stands alone, and is converted
%       to its own character code;
%     \item $[\hexnum{80}, \hexnum{BF}]$: unexpected continuation byte,
%       raise the appropriate flag, and convert that byte to the
%       replacement character \hexnum{FFFD};
%     \item $[\hexnum{C0}, \hexnum{FF}]$: this byte should be followed
%       by some continuation byte(s).
%   \end{itemize}
%   In the first two cases, \cs{@@_use_none_delimit_by_s_stop:w} removes
%   data that only the third case requires, namely the limits of ranges
%   of Unicode characters which can be expressed with $1$, $2$, $3$, or
%   $4$ bytes.
%
%   We can now concentrate on the multi-byte case and the
%   \texttt{_continuation} auxiliary. We expect |#3| to be in the range
%   $[\hexnum{80}, \hexnum{BF}]$. The test for this goes as follows: if
%   the character code is less than \hexnum{80}, we compare it to
%   $-\hexnum{C0}$, yielding \texttt{false}; otherwise to \hexnum{C0},
%   yielding \texttt{true} in the range $[\hexnum{80}, \hexnum{BF}]$ and
%   \texttt{false} otherwise. If we find that the byte is not a
%   continuation byte, stop the current slew of bytes, output the
%   replacement character, and continue parsing with the \texttt{_start}
%   auxiliary, starting at the byte we just tested. Once we know that
%   the byte is a continuation byte, leave it behind us in the input
%   stream, compute what code point the bytes read so far would produce,
%   and feed that number to the \texttt{_aux} function.
%
%   The \texttt{_aux} function tests whether we should look for more
%   continuation bytes or not. If the number it receives as |#1| is less
%   than the maximum |#4| for the current range, then we are done: check
%   for an overlong representation by comparing |#1| with the maximum
%   |#3| for the previous range. Otherwise, we call the
%   \texttt{_continuation} auxiliary again, after shifting the
%   \enquote{current code point} by |#4| (maximum from the range we just
%   checked).
%
%   Two additional tests are needed: if we reach the end of the list of
%   range maxima and we are still not done, then we are faced with an
%   overflow. Clean up, and again insert the code point \hexnum{FFFD}
%   for the replacement character. Also, every time we read a byte, we
%   need to check whether we reached the end of the string. In a correct
%   \textsc{utf-8} string, this happens automatically when the
%   \texttt{_start} auxiliary leaves its first argument in the input
%   stream: the end-marker begins with \cs{prg_break:}, which ends
%   the loop. On the other hand, if the end is reached when looking for
%   a continuation byte, the \cs{use_none:n} |#3| construction removes
%   the first token from the end-marker, and leaves the \texttt{_end}
%   auxiliary, which raises the appropriate error flag before ending the
%   mapping.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_decode_utf8: }
  {
    % Reset the general error flag and the four UTF-8 specific flags.
    \flag_clear:N \l_@@_error_flag
    \flag_clear:N \l_@@_missing_flag
    \flag_clear:N \l_@@_extra_flag
    \flag_clear:N \l_@@_overlong_flag
    \flag_clear:N \l_@@_overflow_flag
    % Replace the byte string by its decoded form.  The braced group
    % appended to the string is the end-marker: it is grabbed as a
    % single argument by the auxiliaries and starts with \prg_break:.
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_decode_utf_viii_start:N \g_@@_result_tl
          { \prg_break: \@@_decode_utf_viii_end: }
        \prg_break_point:
      }
    % Errors are reported once, after the whole string has been read.
    \@@_if_flag_error:Nne \l_@@_error_flag { utf8-decode } { }
  }
\cs_new:Npn \@@_decode_utf_viii_start:N #1
  {
    % #1: first byte of a sequence, or the contents of the end-marker
    % (in which case the \prg_break: left here ends the loop).
    #1
    \if_int_compare:w `#1 < "C0 \exp_stop_f:
      \s_@@
      \if_int_compare:w `#1 < "80 \exp_stop_f:
        % Byte below "80: stands for its own character code.
        \int_value:w `#1
      \else:
        % Byte in ["80, "BF]: continuation byte with no leading byte.
        \flag_raise:N \l_@@_extra_flag
        \flag_raise:N \l_@@_error_flag
        \int_use:N \c_@@_replacement_char_int
      \fi:
    \else:
      % Leading byte: start accumulating with its value minus "C0.
      \exp_after:wN \@@_decode_utf_viii_continuation:wwN
      \int_value:w \int_eval:n { `#1 - "C0 } \exp_after:wN
    \fi:
    \s_@@
    % Upper bounds of the code points expressible with 1, 2, 3, 4 bytes.
    % In the two single-byte cases this line removes itself; otherwise
    % it is the range data consumed by the other auxiliaries.
    \@@_use_none_delimit_by_s_stop:w {"80} {"800} {"10000} {"110000} \s_@@_stop
    \@@_decode_utf_viii_start:N
  }
\cs_new:Npn \@@_decode_utf_viii_continuation:wwN
    #1 \s_@@ #2 \@@_decode_utf_viii_start:N #3
  {
    % #1: value accumulated so far, #2: pending range data,
    % #3: next byte, or the contents of the end-marker.
    % For a byte, \use_none:n just drops this copy of it; for the
    % end-marker it drops \prg_break: so that the following
    % \@@_decode_utf_viii_end: takes over.
    \use_none:n #3
    % True exactly for bytes in ["80, "BF]: a byte below "80 is compared
    % with - "C0 rather than "C0.
    \if_int_compare:w `#3 <
          \if_int_compare:w `#3 < "80 \exp_stop_f: - \fi:
          "C0 \exp_stop_f:
      % Continuation byte: leave it in the output and pass the updated
      % value on; the copy of #3 at the end of this macro is absorbed
      % as the last argument of the auxiliary.
      #3
      \exp_after:wN \@@_decode_utf_viii_aux:wNnnwN
      \int_value:w \int_eval:n { #1 * "40 + `#3 - "80 } \exp_after:wN
    \else:
      % Anything else: the sequence is cut short.  Output the
      % replacement character, then let #2 clean up the range data and
      % restart at #3.
      \s_@@
      \flag_raise:N \l_@@_missing_flag
      \flag_raise:N \l_@@_error_flag
      \int_use:N \c_@@_replacement_char_int
    \fi:
    \s_@@
    #2
    \@@_decode_utf_viii_start:N #3
  }
\cs_new:Npn \@@_decode_utf_viii_aux:wNnnwN
    #1 \s_@@ #2#3#4 #5 \@@_decode_utf_viii_start:N #6
  {
    % #1: candidate code point, #2: clean-up function heading the range
    % data, #3: maximum of the previous range, #4: maximum of the
    % current range, #5: remaining maxima and \s_@@_stop,
    % #6: the continuation byte just read (already output, dropped here).
    \if_int_compare:w #1 < #4 \exp_stop_f:
      % The sequence is complete.
      \s_@@
      \if_int_compare:w #1 < #3 \exp_stop_f:
        % It would have fitted in a shorter sequence: overlong.
        \flag_raise:N \l_@@_overlong_flag
        \flag_raise:N \l_@@_error_flag
        \int_use:N \c_@@_replacement_char_int
      \else:
        #1
      \fi:
    \else:
      % No further range left (#5 is just \s_@@_stop): overflow.  The
      % auxiliary grabs and closes both conditionals.
      \if_meaning:w \s_@@_stop #5
        \@@_decode_utf_viii_overflow:w #1
      \fi:
      % Otherwise look for one more continuation byte, with the value
      % shifted down by the current maximum.
      \exp_after:wN \@@_decode_utf_viii_continuation:wwN
      \int_value:w \int_eval:n { #1 - #4 } \exp_after:wN
    \fi:
    \s_@@
    % Range data for the next step: #4 becomes the "previous" maximum.
    #2 {#4} #5
    \@@_decode_utf_viii_start:N
  }
\cs_new:Npn \@@_decode_utf_viii_overflow:w #1 \fi: #2 \fi:
  {
    % Called inside the two nested conditionals of the aux function when
    % the value read exceeds the last range maximum.  #1 (the value) and
    % #2 (the call that would look for one more continuation byte) are
    % discarded, and both conditionals are closed.
    \fi: \fi:
    % Every decoded item has the form <bytes> \s_@@ <code point> \s_@@.
    % The bytes are already in the output and the caller supplies the
    % trailing \s_@@, so the separator before the code point must be
    % inserted here, as in all other error paths.
    \s_@@
    \flag_raise:N \l_@@_overflow_flag
    \flag_raise:N \l_@@_error_flag
    \int_use:N \c_@@_replacement_char_int
  }
\cs_new:Npn \@@_decode_utf_viii_end:
  {
    % Reached when the string ends while a continuation byte is still
    % expected: record a missing byte, output the replacement character
    % for the unfinished sequence and end the loop.
    \s_@@
    \flag_raise:N \l_@@_missing_flag
    \flag_raise:N \l_@@_error_flag
    \int_use:N \c_@@_replacement_char_int \s_@@
    \prg_break:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{\textsc{utf-16} support}
%
% The definitions are done in a category code régime where the bytes
% $254$ and $255$ used by the byte order mark have catcode~$12$.
%    \begin{macrocode}
\group_begin:
  % Bytes 254 and 255 are typed literally in the definitions below (byte
  % order mark), so give them category code "other" within this group.
  \char_set_catcode_other:N \^^fe
  \char_set_catcode_other:N \^^ff
%    \end{macrocode}
%
% \begin{macro}
%   {
%     \@@_convert_encode_utf16:   ,
%     \@@_convert_encode_utf16be: ,
%     \@@_convert_encode_utf16le: ,
%   }
% \begin{macro}[rEXP]
%   {
%     \@@_encode_utf_xvi_aux:N  ,
%     \@@_encode_utf_xvi_char:n ,
%   }
%   When the endianness is not specified, it is big-endian by default,
%   and we add a byte-order mark.  Convert characters one by one in a
%   loop, with different behaviours depending on the character code.
%   \begin{itemize}
%     \item $[0, \hexnum{D7FF}]$: converted to two bytes;
%     \item $[\hexnum{D800}, \hexnum{DFFF}]$ are used as surrogates:
%       they cannot be converted and are replaced by the replacement
%       character;
%     \item $[\hexnum{E000}, \hexnum{FFFF}]$: converted to two bytes;
%     \item $[\hexnum{10000}, \hexnum{10FFFF}]$: converted to a pair of
%       surrogates, each two bytes. The magic \hexnum{D7C0} is
%       $\hexnum{D800}-\hexnum{10000}/\hexnum{400}$.
%   \end{itemize}
%   For the duration of this operation, \cs{@@_tmp:w} is defined as a
%   function to convert a number in the range $[0, \hexnum{FFFF}]$ to a
%   pair of bytes (either big endian or little endian): it is set equal
%   to \cs{@@_output_byte_pair_be:n} or its \texttt{le} analog.
%    \begin{macrocode}
  \cs_new_protected:cpn { @@_convert_encode_utf16: }
    {
      % No endianness given: encode as big-endian, then prepend the byte
      % order mark (bytes 254, 255) to the result.
      \@@_encode_utf_xvi_aux:N \@@_output_byte_pair_be:n
      \tl_gput_left:Ne \g_@@_result_tl { ^^fe ^^ff }
    }
  % Explicit endianness: same encoder, with the matching byte-pair
  % output function and no byte order mark.
  \cs_new_protected:cpn { @@_convert_encode_utf16be: }
    { \@@_encode_utf_xvi_aux:N \@@_output_byte_pair_be:n }
  \cs_new_protected:cpn { @@_convert_encode_utf16le: }
    { \@@_encode_utf_xvi_aux:N \@@_output_byte_pair_le:n }
  \cs_new_protected:Npn \@@_encode_utf_xvi_aux:N #1
    {
      % #1: function that outputs a 16-bit number as a pair of bytes.
      \flag_clear:N \l_@@_error_flag
      % The per-character encoder reaches #1 through \@@_tmp:w.
      \cs_set_eq:NN \@@_tmp:w #1
      \@@_convert_gmap_internal:N \@@_encode_utf_xvi_char:n
      % The only possible encoding error is a surrogate code point.
      \@@_if_flag_error:Nne \l_@@_error_flag { utf16-encode } { }
    }
  \cs_new:Npn \@@_encode_utf_xvi_char:n #1
    {
      % #1: code point.  \@@_tmp:w outputs one 16-bit unit as two bytes.
      \if_int_compare:w #1 < "D800 \exp_stop_f:
        % Below the surrogates: one unit.
        \@@_tmp:w {#1}
      \else:
        \if_int_compare:w #1 < "10000 \exp_stop_f:
          \if_int_compare:w #1 < "E000 \exp_stop_f:
            % Surrogate code point: cannot be encoded, replace it.
            \flag_raise:N \l_@@_error_flag
            \@@_tmp:w { \c_@@_replacement_char_int }
          \else:
            % ["E000, "FFFF]: one unit.
            \@@_tmp:w {#1}
          \fi:
        \else:
          % "10000 and above: lead surrogate, then trail surrogate.
          % "D7C0 is "D800 - "10000 / "400.
          \exp_args:Nf \@@_tmp:w { \int_div_truncate:nn {#1} {"400} + "D7C0 }
          \exp_args:Nf \@@_tmp:w { \int_mod:nn {#1} {"400} + "DC00 }
        \fi:
      \fi:
    }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{variable}
%   {
%     @@_missing ,
%     @@_extra   ,
%     @@_end     ,
%   }
%   When encoding a Unicode string to \textsc{utf-16}, only one error
%   can occur: code points in the range $[\hexnum{D800},
%   \hexnum{DFFF}]$, corresponding to surrogates, cannot be encoded. We
%   use the all-purpose flag \texttt{@@_error} to signal that error.
%
%   When decoding a Unicode string which is purportedly in
%   \textsc{utf-16}, three errors can occur: a missing trail surrogate,
%   an unexpected trail surrogate, and a string containing an odd number
%   of bytes.
%    \begin{macrocode}
  \flag_clear_new:N \l_@@_missing_flag  % lead surrogate without trail surrogate
  \flag_clear_new:N \l_@@_extra_flag    % trail surrogate where none was expected
  \flag_clear_new:N \l_@@_end_flag      % odd number of bytes
  % Raised by the UTF-16 encoder when the string contains a code point
  % in the surrogate range.
  \msg_new:nnnn { str } { utf16-encode }
    { Unicode~string~cannot~be~expressed~in~UTF-16:~surrogate. }
    {
      Surrogate~code~points~(in~the~range~[U+D800,~U+DFFF])~
      can~be~expressed~in~the~UTF-8~and~UTF-32~encodings,~
      but~not~in~the~UTF-16~encoding.
    }
  \msg_new:nnnn { str } { utf16-decode }
    {
      % Short text.  Each item starts with a comma; \use_none:n removes
      % the comma of the first item of the f-expanded list.
      Invalid~UTF-16~string:
      \exp_last_unbraced:Nf \use_none:n
        {
          \@@_if_flag_times:NT \l_@@_missing_flag  { ,~missing~trail~surrogate }
          \@@_if_flag_times:NT \l_@@_extra_flag    { ,~extra~trail~surrogate }
          \@@_if_flag_times:NT \l_@@_end_flag      { ,~odd~number~of~bytes }
        }
      .
    }
    {
      % Long text: general reminder first, ...
      In~the~UTF-16~encoding,~each~Unicode~character~is~encoded~as~
      2~or~4~bytes: \\
      \iow_indent:n
        {
          Code~point~in~[U+0000,~U+D7FF]:~two~bytes \\
          Code~point~in~[U+D800,~U+DFFF]:~illegal \\
          Code~point~in~[U+E000,~U+FFFF]:~two~bytes \\
          Code~point~in~[U+10000,~U+10FFFF]:~
            a~lead~surrogate~and~a~trail~surrogate \\
        }
      Lead~surrogates~are~pairs~of~bytes~in~the~range~[0xD800,~0xDBFF],~
      and~trail~surrogates~are~in~the~range~[0xDC00,~0xDFFF].
      % ... then one extra paragraph per flag that is raised.
      \flag_if_raised:NT \l_@@_missing_flag
        {
          \\\\
          A~lead~surrogate~was~not~followed~by~a~trail~surrogate.
        }
      \flag_if_raised:NT \l_@@_extra_flag
        {
          \\\\
          LaTeX~came~across~a~trail~surrogate~when~it~was~not~expected.
        }
      \flag_if_raised:NT \l_@@_end_flag
        {
          \\\\
          The~string~contained~an~odd~number~of~bytes.~This~is~invalid:~
          the~basic~code~unit~for~UTF-16~is~16~bits~(2~bytes).
        }
    }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}
%   {
%     \@@_convert_decode_utf16:   ,
%     \@@_convert_decode_utf16be: ,
%     \@@_convert_decode_utf16le: ,
%   }
% \begin{macro}{\@@_decode_utf_xvi_bom:NN, \@@_decode_utf_xvi:Nw}
%   As for \textsc{utf-8}, decoding \textsc{utf-16} is harder than
%   encoding it. If the endianness is unknown, check the first two
%   bytes: if those are \hexnum{FE} and \hexnum{FF} in either order,
%   remove them and use the corresponding endianness, otherwise assume
%   big-endianness. The three endianness cases are based on a common
%   auxiliary whose first argument is $1$ for big-endian and $2$ for
%   little-endian, and whose second argument, delimited by the scan mark
%   \cs{s_@@_stop}, is expanded once (the string may be long; passing
%   \cs{g_@@_result_tl} as an argument before expansion is cheaper).
%
%   The \cs{@@_decode_utf_xvi:Nw} function defines \cs{@@_tmp:w} to
%   take two arguments and return the character code of the first one if
%   the string is big-endian, and the second one if the string is
%   little-endian, then loops over the string using
%   \cs{@@_decode_utf_xvi_pair:NN} described below.
%    \begin{macrocode}
  % Known endianness: 1 selects big-endian, 2 little-endian.  The string
  % is passed unexpanded, delimited by \s_@@_stop.
  \cs_new_protected:cpn { @@_convert_decode_utf16be: }
    { \@@_decode_utf_xvi:Nw 1 \g_@@_result_tl \s_@@_stop }
  \cs_new_protected:cpn { @@_convert_decode_utf16le: }
    { \@@_decode_utf_xvi:Nw 2 \g_@@_result_tl \s_@@_stop }
  \cs_new_protected:cpn { @@_convert_decode_utf16: }
    {
      % Unknown endianness: expand the string so that its first two
      % bytes can be inspected.  Two of the three scan marks stand in
      % for those bytes when the string is shorter; one of them ends up
      % delimiting the string for the decoder.
      \exp_after:wN \@@_decode_utf_xvi_bom:NN
        \g_@@_result_tl \s_@@_stop \s_@@_stop \s_@@_stop
    }
  \cs_new_protected:Npn \@@_decode_utf_xvi_bom:NN #1#2
    {
      % #1#2: first two bytes of the string.
      % Bytes 255, 254: little-endian mark, which is dropped.
      \str_if_eq:nnTF { #1#2 } { ^^ff ^^fe }
        { \@@_decode_utf_xvi:Nw 2 }
        {
          % Bytes 254, 255: big-endian mark, dropped.  Anything else:
          % assume big-endian and put the two bytes back.
          \str_if_eq:nnTF { #1#2 } { ^^fe ^^ff }
            { \@@_decode_utf_xvi:Nw 1 }
            { \@@_decode_utf_xvi:Nw 1 #1#2 }
        }
    }
  \cs_new_protected:Npn \@@_decode_utf_xvi:Nw #1#2 \s_@@_stop
    {
      % #1: 1 (big-endian) or 2 (little-endian), #2: the byte string or
      % the variable holding it.
      \flag_clear:N \l_@@_error_flag
      \flag_clear:N \l_@@_missing_flag
      \flag_clear:N \l_@@_extra_flag
      \flag_clear:N \l_@@_end_flag
      % \@@_tmp:w <a> <b> gives the character code of its argument
      % number #1, i.e. of the most significant byte of the pair.
      \cs_set:Npn \@@_tmp:w ##1 ##2 { ` ## #1 }
      % Decode pair by pair; two \q_@@_nil mark the end of the string.
      \__kernel_tl_gset:Nx \g_@@_result_tl
        {
          \exp_after:wN \@@_decode_utf_xvi_pair:NN
            #2 \q_@@_nil \q_@@_nil
          \prg_break_point:
        }
      \@@_if_flag_error:Nne \l_@@_error_flag { utf16-decode } { }
    }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]
%   {
%     \@@_decode_utf_xvi_pair:NN     ,
%     \@@_decode_utf_xvi_quad:NNwNN  ,
%     \@@_decode_utf_xvi_pair_end:Nw ,
%   }
% \begin{macro}[rEXP]
%   {
%     \@@_decode_utf_xvi_error:nNN ,
%     \@@_decode_utf_xvi_extra:NNw ,
%   }
%   Bytes are read two at a time. At this stage, |\@@_tmp:w #1#2|
%   expands to the character code of the most significant byte, and we
%   distinguish cases depending on which range it lies in:
%   \begin{itemize}
%     \item $[\hexnum{D8}, \hexnum{DB}]$ signals a lead surrogate, and
%       the integer expression yields $1$ (\eTeX{} rounds ties away from
%       zero);
%     \item $[\hexnum{DC}, \hexnum{DF}]$ signals a trail surrogate,
%       unexpected here, and the integer expression yields $2$;
%     \item any other value signals a code point in the Basic
%       Multilingual Plane, which stands for itself, and the
%       \cs{if_case:w} construction expands to nothing (cases other than
%       $1$ or $2$), leaving the relevant material in the input stream,
%       followed by another call to the \texttt{_pair} auxiliary.
%   \end{itemize}
%   The case of a lead surrogate is treated by the \texttt{_quad}
%   auxiliary, whose arguments |#1|, |#2|, |#4| and |#5| are the four
%   bytes. We expect the most significant byte of |#4#5| to be in the
%   range $[\hexnum{DC}, \hexnum{DF}]$ (trail surrogate). The test is
%   similar to the test used for continuation bytes in the
%   \textsc{utf-8} decoding functions. In the case where |#4#5| is
%   indeed a trail surrogate, leave |#1#2#4#5| \cs{s_@@}
%   \meta{code~point} \cs{s_@@}, and remove the pair |#4#5| before
%   looping with \cs{@@_decode_utf_xvi_pair:NN}. Otherwise, of course,
%   complain about the missing surrogate.
%
%   The magic number \hexnum{D7F7} is such that
%   $\hexnum{D7F7}*\hexnum{400} = \hexnum{D800}*\hexnum{400} +
%   \hexnum{DC00} - \hexnum{10000}$.
%
%   Every time we read a pair of bytes, we test for the end-marker
%   \cs{q_@@_nil}. When reaching the end, we additionally check that the
%   string had an even length. Also, if the end is reached when
%   expecting a trail surrogate, we treat that as a missing surrogate.
%    \begin{macrocode}
  \cs_new:Npn \@@_decode_utf_xvi_pair:NN #1#2
    {
      % #1#2: next two bytes, or end-markers.
      \if_meaning:w \q_@@_nil #2
        \@@_decode_utf_xvi_pair_end:Nw #1
      \fi:
      % With rounding, the expression is 1 when the most significant
      % byte is in ["D8, "DB] (lead surrogate) and 2 when it is in
      % ["DC, "DF] (trail surrogate); no other value selects a branch.
      \if_case:w
        \int_eval:n { ( \@@_tmp:w #1#2 - "D6 ) / 4 } \scan_stop:
      \or: \exp_after:wN \@@_decode_utf_xvi_quad:NNwNN
      \or: \exp_after:wN \@@_decode_utf_xvi_extra:NNw
      \fi:
      % Default output: the two bytes and the 16-bit value they encode.
      % The two auxiliaries above absorb this material when selected.
      #1#2 \s_@@
      \int_eval:n { "100 * \@@_tmp:w #1#2 + \@@_tmp:w #2#1 } \s_@@
      \@@_decode_utf_xvi_pair:NN
    }
  \cs_new:Npn \@@_decode_utf_xvi_quad:NNwNN
      #1#2 #3 \@@_decode_utf_xvi_pair:NN #4#5
    {
      % #1#2: lead surrogate, #3: default output of the pair auxiliary
      % (discarded), #4#5: the following two bytes, or end-markers.
      \if_meaning:w \q_@@_nil #5
        % End of string while expecting the trail surrogate.
        \@@_decode_utf_xvi_error:nNN { missing } #1#2
        \@@_decode_utf_xvi_pair_end:Nw #4
      \fi:
      % True exactly when the most significant byte of #4#5 is in
      % ["DC, "DF]: below "DC the test is replaced by the false "0 = 1".
      \if_int_compare:w
          \if_int_compare:w \@@_tmp:w #4#5 < "DC \exp_stop_f:
            0 = 1
          \else:
            \@@_tmp:w #4#5 < "E0
          \fi:
          \exp_stop_f:
        % Valid pair: output the four bytes and the combined code point
        % ("D7F7 * "400 = "D800 * "400 + "DC00 - "10000).
        #1 #2 #4 #5 \s_@@
        \int_eval:n
          {
            ( "100 * \@@_tmp:w #1#2 + \@@_tmp:w #2#1 - "D7F7 ) * "400
            + "100 * \@@_tmp:w #4#5 + \@@_tmp:w #5#4
          }
        \s_@@
        % Keep the loop function below, drop #4#5 which were consumed.
        \exp_after:wN \use_i:nnn
      \else:
        % No trail surrogate: report, then reprocess #4#5 as a new pair.
        \@@_decode_utf_xvi_error:nNN { missing } #1#2
      \fi:
      \@@_decode_utf_xvi_pair:NN #4#5
    }
  \cs_new:Npn \@@_decode_utf_xvi_pair_end:Nw #1 \fi:
    {
      % Called inside a conditional, which is closed first.
      \fi:
      % #1 is the first token of the last pair read: if it is a byte
      % rather than the end-marker, the string had an odd length.
      % \prg_do_nothing: fills the second byte slot of the error macro.
      \if_meaning:w \q_@@_nil #1
      \else:
        \@@_decode_utf_xvi_error:nNN { end } #1 \prg_do_nothing:
      \fi:
      \prg_break:
    }
  % Unexpected trail surrogate #1#2: discard the default output of the
  % pair auxiliary (up to the second \s_@@) and report the error instead.
  \cs_new:Npn \@@_decode_utf_xvi_extra:NNw #1#2 \s_@@ #3 \s_@@
    { \@@_decode_utf_xvi_error:nNN { extra } #1#2 }
  \cs_new:Npn \@@_decode_utf_xvi_error:nNN #1#2#3
    {
      % #1: flag name (missing, extra or end), #2#3: offending bytes.
      \flag_raise:N \l_@@_error_flag
      \flag_raise:c { l_@@_#1_flag }
      % Output the bytes with the replacement character as code point.
      #2 #3 \s_@@
      \int_use:N \c_@@_replacement_char_int \s_@@
    }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% Restore the original catcodes of bytes $254$ and $255$.
%    \begin{macrocode}
\group_end: % closes the group opened for the UTF-16 definitions
%    \end{macrocode}
%
% \subsubsection{\textsc{utf-32} support}
%
% The definitions are done in a category code régime where the bytes
% $0$, $254$ and $255$ used by the byte order mark have catcode
% \enquote{other}.
%    \begin{macrocode}
\group_begin:
  % Bytes 0, 254 and 255 are typed literally in the definitions below,
  % so give them category code "other" within this group.
  \char_set_catcode_other:N \^^00
  \char_set_catcode_other:N \^^fe
  \char_set_catcode_other:N \^^ff
%    \end{macrocode}
%
% \begin{macro}
%   {
%     \@@_convert_encode_utf32:   ,
%     \@@_convert_encode_utf32be: ,
%     \@@_convert_encode_utf32le: ,
%   }
% \begin{macro}[rEXP]
%   {
%     \@@_encode_utf_xxxii_be:n      ,
%     \@@_encode_utf_xxxii_be_aux:nn ,
%     \@@_encode_utf_xxxii_le:n      ,
%     \@@_encode_utf_xxxii_le_aux:nn ,
%   }
%   Convert each integer in the comma-list \cs{g_@@_result_tl} to a
%   sequence of four bytes. The functions for big-endian and
%   little-endian encodings are very similar, but the
%   \cs{@@_output_byte:n} instructions are reversed.
%    \begin{macrocode}
  \cs_new_protected:cpn { @@_convert_encode_utf32: }
    {
      % No endianness given: encode as big-endian, then prepend the
      % four-byte byte order mark (bytes 0, 0, 254, 255).
      \@@_convert_gmap_internal:N \@@_encode_utf_xxxii_be:n
      \tl_gput_left:Ne \g_@@_result_tl { ^^00 ^^00 ^^fe ^^ff }
    }
  % Explicit endianness: map the matching per-character encoder, without
  % a byte order mark.
  \cs_new_protected:cpn { @@_convert_encode_utf32be: }
    { \@@_convert_gmap_internal:N \@@_encode_utf_xxxii_be:n }
  \cs_new_protected:cpn { @@_convert_encode_utf32le: }
    { \@@_convert_gmap_internal:N \@@_encode_utf_xxxii_le:n }
  \cs_new:Npn \@@_encode_utf_xxxii_be:n #1
    {
      % Pass the quotient of the code point #1 by "100, then #1 itself.
      \exp_args:Nf \@@_encode_utf_xxxii_be_aux:nn
        { \int_div_truncate:nn {#1} { "100 } } {#1}
    }
  \cs_new:Npn \@@_encode_utf_xxxii_be_aux:nn #1#2
    {
      % Big-endian: a zero byte, the quotient #1 as a byte pair, then the
      % remainder of #2 modulo "100 as the least significant byte.
      ^^00
      \@@_output_byte_pair_be:n {#1}
      \@@_output_byte:n { #2 - #1 * "100 }
    }
  \cs_new:Npn \@@_encode_utf_xxxii_le:n #1
    {
      % Pass the quotient of the code point #1 by "100, then #1 itself.
      \exp_args:Nf \@@_encode_utf_xxxii_le_aux:nn
        { \int_div_truncate:nn {#1} { "100 } } {#1}
    }
  \cs_new:Npn \@@_encode_utf_xxxii_le_aux:nn #1#2
    {
      % Little-endian: same three pieces as the big-endian version, in
      % reverse order.
      \@@_output_byte:n { #2 - #1 * "100 }
      \@@_output_byte_pair_le:n {#1}
      ^^00
    }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{variable}{@@_overflow, @@_end}
%   There can be no error when encoding in \textsc{utf-32}. When
%   decoding, the string may not have length $4n$, or it may contain
%   code points larger than \hexnum{10FFFF}. The latter case often
%   happens if the encoding was in fact not \textsc{utf-32}, because
%   most arbitrary strings are not valid in \textsc{utf-32}.
%    \begin{macrocode}
  \flag_clear_new:N \l_@@_overflow_flag  % code point too large
  \flag_clear_new:N \l_@@_end_flag       % length not a multiple of 4
  \msg_new:nnnn { str } { utf32-decode }
    {
      % Short text.  Each item starts with a comma; \use_none:n removes
      % the comma of the first item of the f-expanded list.
      Invalid~UTF-32~string:
      \exp_last_unbraced:Nf \use_none:n
        {
          \@@_if_flag_times:NT \l_@@_overflow_flag { ,~code~point~too~large }
          \@@_if_flag_times:NT \l_@@_end_flag      { ,~truncated~string }
        }
      .
    }
    {
      % Long text: general statement, then one paragraph per raised flag.
      In~the~UTF-32~encoding,~every~Unicode~character~
      (in~the~range~[U+0000,~U+10FFFF])~is~encoded~as~4~bytes.
      \flag_if_raised:NT \l_@@_overflow_flag
        {
          \\\\
          LaTeX~came~across~a~code~point~larger~than~1114111,~
          the~maximum~code~point~defined~by~Unicode.~
          Perhaps~the~string~was~not~encoded~in~the~UTF-32~encoding?
        }
      \flag_if_raised:NT \l_@@_end_flag
        {
          \\\\
          The~length~of~the~string~is~not~a~multiple~of~4.~
          Perhaps~the~string~was~truncated?
        }
    }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}
%   {
%     \@@_convert_decode_utf32:   ,
%     \@@_convert_decode_utf32be: ,
%     \@@_convert_decode_utf32le: ,
%   }
% \begin{macro}
%   {\@@_decode_utf_xxxii_bom:NNNN, \@@_decode_utf_xxxii:Nw}
% \begin{macro}[rEXP]
%   {\@@_decode_utf_xxxii_loop:NNNN, \@@_decode_utf_xxxii_end:w}
%
%   The structure is similar to \textsc{utf-16} decoding functions. If
%   the endianness is not given, test the first $4$ bytes of the string
%   (possibly \cs{s_@@_stop} if the string is too short) for the presence
%   of a byte-order mark. If there is a byte-order mark, use that
%   endianness, and remove the $4$ bytes, otherwise default to
%   big-endian, and leave the $4$ bytes in place. The
%   \cs{@@_decode_utf_xxxii:Nw} auxiliary receives $1$ or $2$ as its
%   first argument indicating endianness, and the string to convert as
%   its second argument (expanded or not). It sets \cs{@@_tmp:w} to
%   expand to the character code of either of its two arguments
%   depending on endianness, then triggers the \texttt{_loop} auxiliary
%   inside an \texttt{x}-expanding assignment to \cs{g_@@_result_tl}.
%
%   The \texttt{_loop} auxiliary first checks for the end-of-string
%   marker \cs{s_@@_stop}, calling the \texttt{_end} auxiliary if
%   appropriate. Otherwise, leave the \meta{4~bytes} \cs{s_@@} behind,
%   then check that the code point is not overflowing: the leading byte
%   must be $0$, and the following byte at most $16$.
%
%   In the ending code, we check that there remains no byte: there
%   should be nothing left until the first \cs{s_@@_stop}. Break the map.
%    \begin{macrocode}
  % Known endianness: 1 selects big-endian, 2 little-endian.  The string
  % is passed unexpanded, delimited by \s_@@_stop.
  \cs_new_protected:cpn { @@_convert_decode_utf32be: }
    { \@@_decode_utf_xxxii:Nw 1 \g_@@_result_tl \s_@@_stop }
  \cs_new_protected:cpn { @@_convert_decode_utf32le: }
    { \@@_decode_utf_xxxii:Nw 2 \g_@@_result_tl \s_@@_stop }
  \cs_new_protected:cpn { @@_convert_decode_utf32: }
    {
      % Unknown endianness: expand the string so that its first four
      % bytes can be inspected.  Four of the five scan marks stand in
      % for those bytes when the string is shorter; one of them ends up
      % delimiting the string for the decoder.
      \exp_after:wN \@@_decode_utf_xxxii_bom:NNNN \g_@@_result_tl
        \s_@@_stop \s_@@_stop \s_@@_stop \s_@@_stop \s_@@_stop
    }
  \cs_new_protected:Npn \@@_decode_utf_xxxii_bom:NNNN #1#2#3#4
    {
      % #1#2#3#4: first four bytes of the string.
      % Bytes 255, 254, 0, 0: little-endian mark, which is dropped.
      \str_if_eq:nnTF { #1#2#3#4 } { ^^ff ^^fe ^^00 ^^00 }
        { \@@_decode_utf_xxxii:Nw 2 }
        {
          % Bytes 0, 0, 254, 255: big-endian mark, dropped.  Anything
          % else: assume big-endian and put the four bytes back.
          \str_if_eq:nnTF { #1#2#3#4 } { ^^00 ^^00 ^^fe ^^ff }
            { \@@_decode_utf_xxxii:Nw 1 }
            { \@@_decode_utf_xxxii:Nw 1 #1#2#3#4 }
        }
    }
  \cs_new_protected:Npn \@@_decode_utf_xxxii:Nw #1#2 \s_@@_stop
    {
      % #1: 1 (big-endian) or 2 (little-endian), #2: the byte string or
      % the variable holding it.
      \flag_clear:N \l_@@_overflow_flag
      \flag_clear:N \l_@@_end_flag
      \flag_clear:N \l_@@_error_flag
      % \@@_tmp:w <a> <b> gives the character code of its argument
      % number #1; the loop always lists the big-endian choice first.
      \cs_set:Npn \@@_tmp:w ##1 ##2 { ` ## #1 }
      % Decode four bytes at a time; four scan marks pad the last group.
      \__kernel_tl_gset:Nx \g_@@_result_tl
        {
          \exp_after:wN \@@_decode_utf_xxxii_loop:NNNN
            #2 \s_@@_stop \s_@@_stop \s_@@_stop \s_@@_stop
          \prg_break_point:
        }
      \@@_if_flag_error:Nne \l_@@_error_flag { utf32-decode } { }
    }
  \cs_new:Npn \@@_decode_utf_xxxii_loop:NNNN #1#2#3#4
    {
      % #1#2#3#4: next four bytes.  If #4 is a scan mark the string is
      % finished (possibly with 1 to 3 bytes left over).
      \if_meaning:w \s_@@_stop #4
        \exp_after:wN \@@_decode_utf_xxxii_end:w
      \fi:
      #1#2#3#4 \s_@@
      % The most significant byte must be 0 ...
      \if_int_compare:w \@@_tmp:w #1#4 > \c_zero_int
        \flag_raise:N \l_@@_overflow_flag
        \flag_raise:N \l_@@_error_flag
        \int_use:N \c_@@_replacement_char_int
      \else:
        % ... and the next one at most 16.
        \if_int_compare:w \@@_tmp:w #2#3 > 16 \exp_stop_f:
          \flag_raise:N \l_@@_overflow_flag
          \flag_raise:N \l_@@_error_flag
          \int_use:N \c_@@_replacement_char_int
        \else:
          % Combine the three remaining bytes, most significant first.
          \int_eval:n
            { \@@_tmp:w #2#3*"10000 + \@@_tmp:w #3#2*"100 + \@@_tmp:w #4#1 }
        \fi:
      \fi:
      \s_@@
      \@@_decode_utf_xxxii_loop:NNNN
    }
  \cs_new:Npn \@@_decode_utf_xxxii_end:w #1 \s_@@_stop
    {
      % #1: bytes left before the first scan mark.  If there are any,
      % the length was not a multiple of four: report it and output the
      % stray bytes with the replacement character as code point.
      \tl_if_empty:nF {#1}
        {
          \flag_raise:N \l_@@_end_flag
          \flag_raise:N \l_@@_error_flag
          #1 \s_@@
          \int_use:N \c_@@_replacement_char_int \s_@@
        }
      \prg_break:
    }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% Restore the original catcodes of bytes $0$, $254$ and $255$.
%    \begin{macrocode}
\group_end: % closes the group opened for the UTF-32 definitions
%    \end{macrocode}
%
% \subsection{PDF names and strings by expansion}
%
% \begin{macro}[EXP]{\str_convert_pdfname:n}
% \begin{macro}[EXP]{\@@_convert_pdfname:n}
% \begin{macro}[EXP]
%   {\@@_convert_pdfname_bytes:n, \@@_convert_pdfname_bytes_aux:n}
% \begin{macro}[EXP]{\@@_convert_pdfname_bytes_aux:nnnn}
%   To convert to PDF names by expansion, we work purely on UTF-8 input. The
%   first step is to make a string with \enquote{other} spaces,
%   after which we use a simple token-by-token approach. In Unicode
%   engines, characters beyond the one-byte code points (above
%   \hexnum{7F}) are first broken down into bytes, but for $8$-bit
%   engines there is no need to worry. Actual escaping is covered by the
%   same code as used in the non-expandable route.
%    \begin{macrocode}
\cs_new:Npn \str_convert_pdfname:n #1
  {
    % Map the per-character converter over the string #1 by expansion,
    % then turn the fully expanded result into a string.
    \exp_args:Ne \tl_to_str:n
      { \str_map_function:nN {#1} \@@_convert_pdfname:n }
  }
\sys_if_engine_opentype:TF
  {
    % First branch: a character may have a code above "7F, in which
    % case it is converted to bytes before being escaped.
    \cs_new:Npn \@@_convert_pdfname:n #1
      {
        \int_compare:nNnTF { `#1 } > { "7F }
          { \@@_convert_pdfname_bytes:n {#1} }
          { \@@_escape_name_char:n {#1} }
      }
    \cs_new:Npn \@@_convert_pdfname_bytes:n #1
      {
        % Expand the kernel conversion fully before passing it on (it
        % presumably yields four brace groups, some possibly empty;
        % it is defined elsewhere).
        \exp_args:Ne \@@_convert_pdfname_bytes_aux:n
          { \__kernel_codepoint_to_bytes:n {`#1} }
      }
    % Unbrace the result so that it is read as four arguments.
    \cs_new:Npn \@@_convert_pdfname_bytes_aux:n #1
      { \@@_convert_pdfname_bytes_aux:nnnn #1 }
    % The definition is expanded once here so that \c_hash_str becomes a
    % literal hash character; everything else is protected.  The first
    % two bytes are always output, the third and fourth only if non-blank.
    \cs_new:Npe \@@_convert_pdfname_bytes_aux:nnnn #1#2#3#4
      {
        \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#1}
        \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#2}
        \exp_not:N \tl_if_blank:nF {#3}
          {
            \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#3}
            \exp_not:N \tl_if_blank:nF {#4}
              {
                \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#4}
              }
          }
      }
  }
  % Second branch: every character is escaped directly.
  { \cs_new_eq:NN \@@_convert_pdfname:n \@@_escape_name_char:n }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \subsubsection{\textsc{iso 8859} support}
%
% The \textsc{iso-8859-1} encoding exactly matches the first $256$
% Unicode characters. For other 8-bit encodings of the \textsc{iso-8859}
% family, we keep track only of differences, and of unassigned bytes.
%    \begin{macrocode}
%<*iso88591>
\@@_declare_eight_bit_encoding:nnnn { iso88591 } { 256 }
  % No differences: every byte maps to the Unicode character of the same
  % number.
  {
  }
  % No unassigned bytes.
  {
  }
%</iso88591>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88592>
\@@_declare_eight_bit_encoding:nnnn { iso88592 } { 399 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 0104 }
    { A2 } { 02D8 }
    { A3 } { 0141 }
    { A5 } { 013D }
    { A6 } { 015A }
    { A9 } { 0160 }
    { AA } { 015E }
    { AB } { 0164 }
    { AC } { 0179 }
    { AE } { 017D }
    { AF } { 017B }
    { B1 } { 0105 }
    { B2 } { 02DB }
    { B3 } { 0142 }
    { B5 } { 013E }
    { B6 } { 015B }
    { B7 } { 02C7 }
    { B9 } { 0161 }
    { BA } { 015F }
    { BB } { 0165 }
    { BC } { 017A }
    { BD } { 02DD }
    { BE } { 017E }
    { BF } { 017C }
    { C0 } { 0154 }
    { C3 } { 0102 }
    { C5 } { 0139 }
    { C6 } { 0106 }
    { C8 } { 010C }
    { CA } { 0118 }
    { CC } { 011A }
    { CF } { 010E }
    { D0 } { 0110 }
    { D1 } { 0143 }
    { D2 } { 0147 }
    { D5 } { 0150 }
    { D8 } { 0158 }
    { D9 } { 016E }
    { DB } { 0170 }
    { DE } { 0162 }
    { E0 } { 0155 }
    { E3 } { 0103 }
    { E5 } { 013A }
    { E6 } { 0107 }
    { E8 } { 010D }
    { EA } { 0119 }
    { EC } { 011B }
    { EF } { 010F }
    { F0 } { 0111 }
    { F1 } { 0144 }
    { F2 } { 0148 }
    { F5 } { 0151 }
    { F8 } { 0159 }
    { F9 } { 016F }
    { FB } { 0171 }
    { FE } { 0163 }
    { FF } { 02D9 }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso88592>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88593>
\@@_declare_eight_bit_encoding:nnnn { iso88593 } { 384 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 0126 }
    { A2 } { 02D8 }
    { A6 } { 0124 }
    { A9 } { 0130 }
    { AA } { 015E }
    { AB } { 011E }
    { AC } { 0134 }
    { AF } { 017B }
    { B1 } { 0127 }
    { B6 } { 0125 }
    { B9 } { 0131 }
    { BA } { 015F }
    { BB } { 011F }
    { BC } { 0135 }
    { BF } { 017C }
    { C5 } { 010A }
    { C6 } { 0108 }
    { D5 } { 0120 }
    { D8 } { 011C }
    { DD } { 016C }
    { DE } { 015C }
    { E5 } { 010B }
    { E6 } { 0109 }
    { F5 } { 0121 }
    { F8 } { 011D }
    { FD } { 016D }
    { FE } { 015D }
    { FF } { 02D9 }
  }
  % Bytes (hexadecimal) that are unassigned in this encoding.
  {
    { A5 }
    { AE }
    { BE }
    { C3 }
    { D0 }
    { E3 }
    { F0 }
  }
%</iso88593>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88594>
\@@_declare_eight_bit_encoding:nnnn { iso88594 } { 383 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 0104 }
    { A2 } { 0138 }
    { A3 } { 0156 }
    { A5 } { 0128 }
    { A6 } { 013B }
    { A9 } { 0160 }
    { AA } { 0112 }
    { AB } { 0122 }
    { AC } { 0166 }
    { AE } { 017D }
    { B1 } { 0105 }
    { B2 } { 02DB }
    { B3 } { 0157 }
    { B5 } { 0129 }
    { B6 } { 013C }
    { B7 } { 02C7 }
    { B9 } { 0161 }
    { BA } { 0113 }
    { BB } { 0123 }
    { BC } { 0167 }
    { BD } { 014A }
    { BE } { 017E }
    { BF } { 014B }
    { C0 } { 0100 }
    { C7 } { 012E }
    { C8 } { 010C }
    { CA } { 0118 }
    { CC } { 0116 }
    { CF } { 012A }
    { D0 } { 0110 }
    { D1 } { 0145 }
    { D2 } { 014C }
    { D3 } { 0136 }
    { D9 } { 0172 }
    { DD } { 0168 }
    { DE } { 016A }
    { E0 } { 0101 }
    { E7 } { 012F }
    { E8 } { 010D }
    { EA } { 0119 }
    { EC } { 0117 }
    { EF } { 012B }
    { F0 } { 0111 }
    { F1 } { 0146 }
    { F2 } { 014D }
    { F3 } { 0137 }
    { F9 } { 0173 }
    { FD } { 0169 }
    { FE } { 016B }
    { FF } { 02D9 }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso88594>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88595>
\@@_declare_eight_bit_encoding:nnnn { iso88595 } { 374 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 0401 }
    { A2 } { 0402 }
    { A3 } { 0403 }
    { A4 } { 0404 }
    { A5 } { 0405 }
    { A6 } { 0406 }
    { A7 } { 0407 }
    { A8 } { 0408 }
    { A9 } { 0409 }
    { AA } { 040A }
    { AB } { 040B }
    { AC } { 040C }
    { AE } { 040E }
    { AF } { 040F }
    { B0 } { 0410 }
    { B1 } { 0411 }
    { B2 } { 0412 }
    { B3 } { 0413 }
    { B4 } { 0414 }
    { B5 } { 0415 }
    { B6 } { 0416 }
    { B7 } { 0417 }
    { B8 } { 0418 }
    { B9 } { 0419 }
    { BA } { 041A }
    { BB } { 041B }
    { BC } { 041C }
    { BD } { 041D }
    { BE } { 041E }
    { BF } { 041F }
    { C0 } { 0420 }
    { C1 } { 0421 }
    { C2 } { 0422 }
    { C3 } { 0423 }
    { C4 } { 0424 }
    { C5 } { 0425 }
    { C6 } { 0426 }
    { C7 } { 0427 }
    { C8 } { 0428 }
    { C9 } { 0429 }
    { CA } { 042A }
    { CB } { 042B }
    { CC } { 042C }
    { CD } { 042D }
    { CE } { 042E }
    { CF } { 042F }
    { D0 } { 0430 }
    { D1 } { 0431 }
    { D2 } { 0432 }
    { D3 } { 0433 }
    { D4 } { 0434 }
    { D5 } { 0435 }
    { D6 } { 0436 }
    { D7 } { 0437 }
    { D8 } { 0438 }
    { D9 } { 0439 }
    { DA } { 043A }
    { DB } { 043B }
    { DC } { 043C }
    { DD } { 043D }
    { DE } { 043E }
    { DF } { 043F }
    { E0 } { 0440 }
    { E1 } { 0441 }
    { E2 } { 0442 }
    { E3 } { 0443 }
    { E4 } { 0444 }
    { E5 } { 0445 }
    { E6 } { 0446 }
    { E7 } { 0447 }
    { E8 } { 0448 }
    { E9 } { 0449 }
    { EA } { 044A }
    { EB } { 044B }
    { EC } { 044C }
    { ED } { 044D }
    { EE } { 044E }
    { EF } { 044F }
    { F0 } { 2116 }
    { F1 } { 0451 }
    { F2 } { 0452 }
    { F3 } { 0453 }
    { F4 } { 0454 }
    { F5 } { 0455 }
    { F6 } { 0456 }
    { F7 } { 0457 }
    { F8 } { 0458 }
    { F9 } { 0459 }
    { FA } { 045A }
    { FB } { 045B }
    { FC } { 045C }
    { FD } { 00A7 }
    { FE } { 045E }
    { FF } { 045F }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso88595>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88596>
\@@_declare_eight_bit_encoding:nnnn { iso88596 } { 344 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; it is left unchanged here because marking more
  % bytes as unassigned only removes mappings.  Confirm against the
  % declaring function.
  {
    { AC } { 060C }
    { BB } { 061B }
    { BF } { 061F }
    { C1 } { 0621 }
    { C2 } { 0622 }
    { C3 } { 0623 }
    { C4 } { 0624 }
    { C5 } { 0625 }
    { C6 } { 0626 }
    { C7 } { 0627 }
    { C8 } { 0628 }
    { C9 } { 0629 }
    { CA } { 062A }
    { CB } { 062B }
    { CC } { 062C }
    { CD } { 062D }
    { CE } { 062E }
    { CF } { 062F }
    { D0 } { 0630 }
    { D1 } { 0631 }
    { D2 } { 0632 }
    { D3 } { 0633 }
    { D4 } { 0634 }
    { D5 } { 0635 }
    { D6 } { 0636 }
    { D7 } { 0637 }
    { D8 } { 0638 }
    { D9 } { 0639 }
    { DA } { 063A }
    { E0 } { 0640 }
    { E1 } { 0641 }
    { E2 } { 0642 }
    { E3 } { 0643 }
    { E4 } { 0644 }
    { E5 } { 0645 }
    { E6 } { 0646 }
    { E7 } { 0647 }
    { E8 } { 0648 }
    { E9 } { 0649 }
    { EA } { 064A }
    { EB } { 064B }
    { EC } { 064C }
    { ED } { 064D }
    { EE } { 064E }
    { EF } { 064F }
    { F0 } { 0650 }
    { F1 } { 0651 }
    { F2 } { 0652 }
  }
  % Bytes (hexadecimal) that are unassigned in this encoding.  The range
  % F3--FF is unassigned in ISO 8859-6 as well and must be listed,
  % otherwise those bytes are treated as the Unicode characters of the
  % same number.
  {
    { A1 }
    { A2 }
    { A3 }
    { A5 }
    { A6 }
    { A7 }
    { A8 }
    { A9 }
    { AA }
    { AB }
    { AE }
    { AF }
    { B0 }
    { B1 }
    { B2 }
    { B3 }
    { B4 }
    { B5 }
    { B6 }
    { B7 }
    { B8 }
    { B9 }
    { BA }
    { BC }
    { BD }
    { BE }
    { C0 }
    { DB }
    { DC }
    { DD }
    { DE }
    { DF }
    { F3 }
    { F4 }
    { F5 }
    { F6 }
    { F7 }
    { F8 }
    { F9 }
    { FA }
    { FB }
    { FC }
    { FD }
    { FE }
    { FF }
  }
%</iso88596>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88597>
\@@_declare_eight_bit_encoding:nnnn { iso88597 } { 498 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; it is left unchanged here because marking one more
  % byte as unassigned only removes a mapping.  Confirm against the
  % declaring function.
  {
    { A1 } { 2018 }
    { A2 } { 2019 }
    { A4 } { 20AC }
    { A5 } { 20AF }
    { AA } { 037A }
    { AF } { 2015 }
    { B4 } { 0384 }
    { B5 } { 0385 }
    { B6 } { 0386 }
    { B8 } { 0388 }
    { B9 } { 0389 }
    { BA } { 038A }
    { BC } { 038C }
    { BE } { 038E }
    { BF } { 038F }
    { C0 } { 0390 }
    { C1 } { 0391 }
    { C2 } { 0392 }
    { C3 } { 0393 }
    { C4 } { 0394 }
    { C5 } { 0395 }
    { C6 } { 0396 }
    { C7 } { 0397 }
    { C8 } { 0398 }
    { C9 } { 0399 }
    { CA } { 039A }
    { CB } { 039B }
    { CC } { 039C }
    { CD } { 039D }
    { CE } { 039E }
    { CF } { 039F }
    { D0 } { 03A0 }
    { D1 } { 03A1 }
    { D3 } { 03A3 }
    { D4 } { 03A4 }
    { D5 } { 03A5 }
    { D6 } { 03A6 }
    { D7 } { 03A7 }
    { D8 } { 03A8 }
    { D9 } { 03A9 }
    { DA } { 03AA }
    { DB } { 03AB }
    { DC } { 03AC }
    { DD } { 03AD }
    { DE } { 03AE }
    { DF } { 03AF }
    { E0 } { 03B0 }
    { E1 } { 03B1 }
    { E2 } { 03B2 }
    { E3 } { 03B3 }
    { E4 } { 03B4 }
    { E5 } { 03B5 }
    { E6 } { 03B6 }
    { E7 } { 03B7 }
    { E8 } { 03B8 }
    { E9 } { 03B9 }
    { EA } { 03BA }
    { EB } { 03BB }
    { EC } { 03BC }
    { ED } { 03BD }
    { EE } { 03BE }
    { EF } { 03BF }
    { F0 } { 03C0 }
    { F1 } { 03C1 }
    { F2 } { 03C2 }
    { F3 } { 03C3 }
    { F4 } { 03C4 }
    { F5 } { 03C5 }
    { F6 } { 03C6 }
    { F7 } { 03C7 }
    { F8 } { 03C8 }
    { F9 } { 03C9 }
    { FA } { 03CA }
    { FB } { 03CB }
    { FC } { 03CC }
    { FD } { 03CD }
    { FE } { 03CE }
  }
  % Bytes (hexadecimal) that are unassigned in this encoding.  FF is
  % unassigned in ISO 8859-7 as well and must be listed, otherwise it is
  % treated as the Unicode character of the same number.
  {
    { AE }
    { D2 }
    { FF }
  }
%</iso88597>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88598>
\@@_declare_eight_bit_encoding:nnnn { iso88598 } { 308 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; it is left unchanged here because marking one more
  % byte as unassigned only removes a mapping.  Confirm against the
  % declaring function.
  {
    { AA } { 00D7 }
    { BA } { 00F7 }
    { DF } { 2017 }
    { E0 } { 05D0 }
    { E1 } { 05D1 }
    { E2 } { 05D2 }
    { E3 } { 05D3 }
    { E4 } { 05D4 }
    { E5 } { 05D5 }
    { E6 } { 05D6 }
    { E7 } { 05D7 }
    { E8 } { 05D8 }
    { E9 } { 05D9 }
    { EA } { 05DA }
    { EB } { 05DB }
    { EC } { 05DC }
    { ED } { 05DD }
    { EE } { 05DE }
    { EF } { 05DF }
    { F0 } { 05E0 }
    { F1 } { 05E1 }
    { F2 } { 05E2 }
    { F3 } { 05E3 }
    { F4 } { 05E4 }
    { F5 } { 05E5 }
    { F6 } { 05E6 }
    { F7 } { 05E7 }
    { F8 } { 05E8 }
    { F9 } { 05E9 }
    { FA } { 05EA }
    { FD } { 200E }
    { FE } { 200F }
  }
  % Bytes (hexadecimal) that are unassigned in this encoding.  FF is
  % unassigned in ISO 8859-8 as well and must be listed, otherwise it is
  % treated as the Unicode character of the same number.
  {
    { A1 }
    { BF }
    { C0 }
    { C1 }
    { C2 }
    { C3 }
    { C4 }
    { C5 }
    { C6 }
    { C7 }
    { C8 }
    { C9 }
    { CA }
    { CB }
    { CC }
    { CD }
    { CE }
    { CF }
    { D0 }
    { D1 }
    { D2 }
    { D3 }
    { D4 }
    { D5 }
    { D6 }
    { D7 }
    { D8 }
    { D9 }
    { DA }
    { DB }
    { DC }
    { DD }
    { DE }
    { FB }
    { FC }
    { FF }
  }
%</iso88598>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88599>
\@@_declare_eight_bit_encoding:nnnn { iso88599 } { 352 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { D0 } { 011E }
    { DD } { 0130 }
    { DE } { 015E }
    { F0 } { 011F }
    { FD } { 0131 }
    { FE } { 015F }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso88599>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885910>
\@@_declare_eight_bit_encoding:nnnn { iso885910 } { 383 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 0104 }
    { A2 } { 0112 }
    { A3 } { 0122 }
    { A4 } { 012A }
    { A5 } { 0128 }
    { A6 } { 0136 }
    { A8 } { 013B }
    { A9 } { 0110 }
    { AA } { 0160 }
    { AB } { 0166 }
    { AC } { 017D }
    { AE } { 016A }
    { AF } { 014A }
    { B1 } { 0105 }
    { B2 } { 0113 }
    { B3 } { 0123 }
    { B4 } { 012B }
    { B5 } { 0129 }
    { B6 } { 0137 }
    { B8 } { 013C }
    { B9 } { 0111 }
    { BA } { 0161 }
    { BB } { 0167 }
    { BC } { 017E }
    { BD } { 2015 }
    { BE } { 016B }
    { BF } { 014B }
    { C0 } { 0100 }
    { C7 } { 012E }
    { C8 } { 010C }
    { CA } { 0118 }
    { CC } { 0116 }
    { D1 } { 0145 }
    { D2 } { 014C }
    { D7 } { 0168 }
    { D9 } { 0172 }
    { E0 } { 0101 }
    { E7 } { 012F }
    { E8 } { 010D }
    { EA } { 0119 }
    { EC } { 0117 }
    { F1 } { 0146 }
    { F2 } { 014D }
    { F7 } { 0169 }
    { F9 } { 0173 }
    { FF } { 0138 }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso885910>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885911>
\@@_declare_eight_bit_encoding:nnnn { iso885911 } { 369 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; it is left unchanged here because marking more
  % bytes as unassigned only removes mappings.  Confirm against the
  % declaring function.
  {
    { A1 } { 0E01 }
    { A2 } { 0E02 }
    { A3 } { 0E03 }
    { A4 } { 0E04 }
    { A5 } { 0E05 }
    { A6 } { 0E06 }
    { A7 } { 0E07 }
    { A8 } { 0E08 }
    { A9 } { 0E09 }
    { AA } { 0E0A }
    { AB } { 0E0B }
    { AC } { 0E0C }
    { AD } { 0E0D }
    { AE } { 0E0E }
    { AF } { 0E0F }
    { B0 } { 0E10 }
    { B1 } { 0E11 }
    { B2 } { 0E12 }
    { B3 } { 0E13 }
    { B4 } { 0E14 }
    { B5 } { 0E15 }
    { B6 } { 0E16 }
    { B7 } { 0E17 }
    { B8 } { 0E18 }
    { B9 } { 0E19 }
    { BA } { 0E1A }
    { BB } { 0E1B }
    { BC } { 0E1C }
    { BD } { 0E1D }
    { BE } { 0E1E }
    { BF } { 0E1F }
    { C0 } { 0E20 }
    { C1 } { 0E21 }
    { C2 } { 0E22 }
    { C3 } { 0E23 }
    { C4 } { 0E24 }
    { C5 } { 0E25 }
    { C6 } { 0E26 }
    { C7 } { 0E27 }
    { C8 } { 0E28 }
    { C9 } { 0E29 }
    { CA } { 0E2A }
    { CB } { 0E2B }
    { CC } { 0E2C }
    { CD } { 0E2D }
    { CE } { 0E2E }
    { CF } { 0E2F }
    { D0 } { 0E30 }
    { D1 } { 0E31 }
    { D2 } { 0E32 }
    { D3 } { 0E33 }
    { D4 } { 0E34 }
    { D5 } { 0E35 }
    { D6 } { 0E36 }
    { D7 } { 0E37 }
    { D8 } { 0E38 }
    { D9 } { 0E39 }
    { DA } { 0E3A }
    { DF } { 0E3F }
    { E0 } { 0E40 }
    { E1 } { 0E41 }
    { E2 } { 0E42 }
    { E3 } { 0E43 }
    { E4 } { 0E44 }
    { E5 } { 0E45 }
    { E6 } { 0E46 }
    { E7 } { 0E47 }
    { E8 } { 0E48 }
    { E9 } { 0E49 }
    { EA } { 0E4A }
    { EB } { 0E4B }
    { EC } { 0E4C }
    { ED } { 0E4D }
    { EE } { 0E4E }
    { EF } { 0E4F }
    { F0 } { 0E50 }
    { F1 } { 0E51 }
    { F2 } { 0E52 }
    { F3 } { 0E53 }
    { F4 } { 0E54 }
    { F5 } { 0E55 }
    { F6 } { 0E56 }
    { F7 } { 0E57 }
    { F8 } { 0E58 }
    { F9 } { 0E59 }
    { FA } { 0E5A }
    { FB } { 0E5B }
  }
  % Bytes (hexadecimal) that are unassigned in this encoding.  The range
  % FC--FF is unassigned in ISO 8859-11 as well and must be listed,
  % otherwise those bytes are treated as the Unicode characters of the
  % same number.
  {
    { DB }
    { DC }
    { DD }
    { DE }
    { FC }
    { FD }
    { FE }
    { FF }
  }
%</iso885911>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885913>
\@@_declare_eight_bit_encoding:nnnn { iso885913 } { 399 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 201D }
    { A5 } { 201E }
    { A8 } { 00D8 }
    { AA } { 0156 }
    { AF } { 00C6 }
    { B4 } { 201C }
    { B8 } { 00F8 }
    { BA } { 0157 }
    { BF } { 00E6 }
    { C0 } { 0104 }
    { C1 } { 012E }
    { C2 } { 0100 }
    { C3 } { 0106 }
    { C6 } { 0118 }
    { C7 } { 0112 }
    { C8 } { 010C }
    { CA } { 0179 }
    { CB } { 0116 }
    { CC } { 0122 }
    { CD } { 0136 }
    { CE } { 012A }
    { CF } { 013B }
    { D0 } { 0160 }
    { D1 } { 0143 }
    { D2 } { 0145 }
    { D4 } { 014C }
    { D8 } { 0172 }
    { D9 } { 0141 }
    { DA } { 015A }
    { DB } { 016A }
    { DD } { 017B }
    { DE } { 017D }
    { E0 } { 0105 }
    { E1 } { 012F }
    { E2 } { 0101 }
    { E3 } { 0107 }
    { E6 } { 0119 }
    { E7 } { 0113 }
    { E8 } { 010D }
    { EA } { 017A }
    { EB } { 0117 }
    { EC } { 0123 }
    { ED } { 0137 }
    { EE } { 012B }
    { EF } { 013C }
    { F0 } { 0161 }
    { F1 } { 0144 }
    { F2 } { 0146 }
    { F4 } { 014D }
    { F8 } { 0173 }
    { F9 } { 0142 }
    { FA } { 015B }
    { FB } { 016B }
    { FD } { 017C }
    { FE } { 017E }
    { FF } { 2019 }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso885913>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885914>
\@@_declare_eight_bit_encoding:nnnn { iso885914 } { 529 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 1E02 }
    { A2 } { 1E03 }
    { A4 } { 010A }
    { A5 } { 010B }
    { A6 } { 1E0A }
    { A8 } { 1E80 }
    { AA } { 1E82 }
    { AB } { 1E0B }
    { AC } { 1EF2 }
    { AF } { 0178 }
    { B0 } { 1E1E }
    { B1 } { 1E1F }
    { B2 } { 0120 }
    { B3 } { 0121 }
    { B4 } { 1E40 }
    { B5 } { 1E41 }
    { B7 } { 1E56 }
    { B8 } { 1E81 }
    { B9 } { 1E57 }
    { BA } { 1E83 }
    { BB } { 1E60 }
    { BC } { 1EF3 }
    { BD } { 1E84 }
    { BE } { 1E85 }
    { BF } { 1E61 }
    { D0 } { 0174 }
    { D7 } { 1E6A }
    { DE } { 0176 }
    { F0 } { 0175 }
    { F7 } { 1E6B }
    { FE } { 0177 }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso885914>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885915>
\@@_declare_eight_bit_encoding:nnnn { iso885915 } { 383 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A4 } { 20AC }
    { A6 } { 0160 }
    { A8 } { 0161 }
    { B4 } { 017D }
    { B8 } { 017E }
    { BC } { 0152 }
    { BD } { 0153 }
    { BE } { 0178 }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso885915>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885916>
\@@_declare_eight_bit_encoding:nnnn { iso885916 } { 558 }
  % Pairs { byte } { Unicode code point }, both hexadecimal, listing only
  % the bytes whose code point differs from the byte value.
  % NOTE(review): the second argument above is presumably the modulo of
  % the lookup table; confirm against the declaring function.
  {
    { A1 } { 0104 }
    { A2 } { 0105 }
    { A3 } { 0141 }
    { A4 } { 20AC }
    { A5 } { 201E }
    { A6 } { 0160 }
    { A8 } { 0161 }
    { AA } { 0218 }
    { AC } { 0179 }
    { AE } { 017A }
    { AF } { 017B }
    { B2 } { 010C }
    { B3 } { 0142 }
    { B4 } { 017D }
    { B5 } { 201D }
    { B8 } { 017E }
    { B9 } { 010D }
    { BA } { 0219 }
    { BC } { 0152 }
    { BD } { 0153 }
    { BE } { 0178 }
    { BF } { 017C }
    { C3 } { 0102 }
    { C5 } { 0106 }
    { D0 } { 0110 }
    { D1 } { 0143 }
    { D5 } { 0150 }
    { D7 } { 015A }
    { D8 } { 0170 }
    { DD } { 0118 }
    { DE } { 021A }
    { E3 } { 0103 }
    { E5 } { 0107 }
    { F0 } { 0111 }
    { F1 } { 0144 }
    { F5 } { 0151 }
    { F7 } { 015B }
    { F8 } { 0171 }
    { FD } { 0119 }
    { FE } { 021B }
  }
  % Unassigned bytes: none are listed for this encoding.
  {
  }
%</iso885916>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex