% \iffalse meta-comment
%
%% File: l3unicode.dtx
%
% Copyright (C) 2018-2025 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3unicode} module\\ Unicode support functions^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2025-01-18}
%
% \maketitle
%
% \begin{documentation}
%
% This module provides Unicode-specific functions along with loading data
% from a range of Unicode Consortium files. Most of the code here is
% internal, but there is a small set of public functions. These work with
% Unicode \meta{codepoints} and are designed to give usable results with
% both Unicode-aware and $8$-bit engines.
%
% \begin{function}[EXP, added = 2022-10-09, updated = 2022-11-09]
%   {\codepoint_generate:nn}
%   \begin{syntax}
%      \cs{codepoint_generate:nn} \Arg{codepoint} \Arg{catcode}
%   \end{syntax}
%   Generates one or more character tokens representing the \meta{codepoint}.
%   With Unicode engines, exactly one character token will be generated, and
%   this will have the \meta{catcode} specified as the second argument:
%   \begin{itemize}
%     \item $1$ (begin group)
%     \item $2$ (end group)
%     \item $3$ (math toggle)
%     \item $4$ (alignment)
%     \item $6$ (parameter)
%     \item $7$ (math superscript)
%     \item $8$ (math subscript)
%     \item $10$ (space)
%     \item $11$ (letter)
%     \item $12$ (other)
%     \item $13$ (active)
%   \end{itemize}
%   For $8$-bit engines, between one and four character tokens will be
%   produced: these will be the bytes of the UTF-8 representation of the
%   \meta{codepoint}. For all codepoints outside of the classical ASCII
%   range, the generated character tokens will be active (category code
%   $13$); for codepoints in the ASCII range, the given \meta{catcode}
%   will be used. To allow the result of this function to be used
%   inside an expansion context, the result is protected by \cs{exp_not:n}.
%
%   \begin{texnote}
%     Users of (u)p\TeX{} note that these engines are treated as $8$-bit in
%     this context. In particular, for up\TeX{}, irrespective of the
%     \tn{kcatcode} of the \meta{codepoint}, any value outside the ASCII range
%     will result in a series of active bytes being generated.
%   \end{texnote}
% \end{function}
%
% \begin{function}[EXP, added = 2022-10-09]
%   {\codepoint_str_generate:n}
%   \begin{syntax}
%      \cs{codepoint_str_generate:n} \Arg{codepoint}
%   \end{syntax}
%   Generates one or more character tokens representing the \meta{codepoint}.
%   With Unicode engines, exactly one character token will be generated.
%   For $8$-bit engines, between one and four character tokens will be
%   produced: these will be the bytes of the UTF-8 representation of the
%   \meta{codepoint}. All of the generated character tokens will be of
%   category code $12$, except any spaces (codepoint $32$), which will be
%   category code $10$.
% \end{function}
%
% \begin{function}[added = 2023-06-19, EXP]{\codepoint_to_category:n}
%   \begin{syntax}
%     \cs{codepoint_to_category:n} \Arg{codepoint}
%   \end{syntax}
%   Expands to the Unicode general category identifier of the \meta{codepoint}.
%   The general category identifier is a string made up of two letter
%   characters, the first uppercase and the second lowercase. The uppercase
%   letters divide codepoints into broader groups, which are then refined
%   by the lowercase letter. For example, codepoints representing letters
%   all have identifiers starting \texttt{L}, for example \texttt{Lu}
%   (uppercase letter), \texttt{Lt} (titlecase letter), \emph{etc.}
%   Full details are available in the documentation provided by the Unicode
%   Consortium: see
%   \url{https://www.unicode.org/reports/tr44/#General_Category_Values}
% \end{function}
%
% \begin{function}[added = 2022-10-09, EXP]{\codepoint_to_nfd:n}
%   \begin{syntax}
%     \cs{codepoint_to_nfd:n} \Arg{codepoint}
%   \end{syntax}
%   Converts the \meta{codepoint} to the Unicode Normalization
%   Form Canonical Decomposition. The generated character(s) will have
%   the current category code as they would if typed in directly for Unicode
%   engines; for $8$-bit engines, active characters are used for all codepoints
%   outside of the ASCII range.
% \end{function}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3unicode} implementation}
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
%    \begin{macrocode}
%<@@=codepoint>
%    \end{macrocode}
%
% \subsection{User functions}
%
% \begin{macro}[EXP]{\codepoint_str_generate:n}
% \begin{macro}[EXP]{\@@_str_generate:nnnn}
% \begin{macro}[EXP]{\codepoint_generate:nn}
% \begin{macro}[EXP]{\@@_generate:nnnn}
% \begin{macro}[EXP]{\@@_generate:n}
%   Conversion of a codepoint to a character (Unicode engines) or to one
%   or more bytes ($8$-bit engines) is required. For loading the data,
%   all that is needed is the form which creates strings: these are outside
%   the group as they will also be used when looking up data in the hash
%   table storage at point-of-use. Later, we will also need functions that
%   can generate character tokens for document use: those are defined below,
%   in the data recovery setup.
%    \begin{macrocode}
\sys_if_engine_opentype:TF
  {
%    \end{macrocode}
%   In this branch each codepoint is a single character token, made
%   directly by \cs{char_generate:nn}. A space is tested for first and
%   returned as an explicit space token, whatever \meta{catcode} was
%   requested.
%    \begin{macrocode}
    \cs_new:Npn \codepoint_str_generate:n #1
      {
        \int_compare:nNnTF {#1} = { `\  }
          { ~ }
          { \char_generate:nn {#1} { 12 } }
      }
%    \end{macrocode}
%   The \cs{exp_after:wN} chain expands \cs{char_generate:nn} before
%   \cs{__kernel_exp_not:w} acts on the braced text, so it is the generated
%   character which is protected from further expansion.
%    \begin{macrocode}
    \cs_new:Npn \codepoint_generate:nn #1#2
      {
        \int_compare:nNnTF {#1} = { `\  }
          { ~ }
          {
            \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
              { \char_generate:nn {#1} {#2} }
          }
      }
  }
  {
%    \end{macrocode}
%   In the other branch the codepoint is first converted to bytes by
%   \cs{__kernel_codepoint_to_bytes:n}. That conversion is expanded inside
%   \cs{use:e}, with the auxiliary held back by \cs{exp_not:N}, so the
%   auxiliary receives the four brace groups as its arguments.
%    \begin{macrocode}
    \cs_new:Npn \codepoint_str_generate:n #1
      {
        \int_compare:nNnTF {#1} = { `\  }
          { ~ }
          {
            \use:e
              {
                \exp_not:N \@@_str_generate:nnnn
                  \__kernel_codepoint_to_bytes:n {#1}
              }
          }
      }
%    \end{macrocode}
%   The first byte is always present; each later byte is only generated if
%   its group is not blank. All bytes are made with category code~$12$.
%    \begin{macrocode}
    \cs_new:Npn \@@_str_generate:nnnn #1#2#3#4
      {
        \char_generate:nn {#1} { 12 }
        \tl_if_blank:nF {#2}
          {
            \char_generate:nn {#2} { 12 }
            \tl_if_blank:nF {#3}
              {
                \char_generate:nn {#3} { 12 }
                \tl_if_blank:nF {#4}
                  { \char_generate:nn {#4} { 12 } }
              }
          }
      }
%    \end{macrocode}
%   Codepoints below |"80| give one character with the requested
%   \meta{catcode}; anything else is passed on to be built from active
%   bytes, and the \meta{catcode} is not used.
%    \begin{macrocode}
    \cs_new:Npn \codepoint_generate:nn #1#2
      {
        \int_compare:nNnTF {#1} = { `\  }
          { ~ }
          {
            \int_compare:nNnTF {#1} < { "80 }
              {
                \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
                  { \char_generate:nn {#1} {#2} }
              }
              {
                \use:e
                  {
                    \exp_not:N \@@_generate:nnnn
                      \__kernel_codepoint_to_bytes:n {#1}
                  }
              }
          }
      }
%    \end{macrocode}
%   This auxiliary always generates the first two bytes, so it expects at
%   least a two-byte sequence; the third and fourth are optional. The
%   bytes are collected by \cs{tex_expanded:D} and the complete result is
%   then protected as a unit.
%    \begin{macrocode}
    \cs_new:Npn \@@_generate:nnnn #1#2#3#4
      {
        \__kernel_exp_not:w \exp_after:wN
          {
            \tex_expanded:D
              {
                \@@_generate:n {#1}
                \@@_generate:n {#2}
                \tl_if_blank:nF {#3}
                  {
                    \@@_generate:n {#3}
                    \tl_if_blank:nF {#4}
                      { \@@_generate:n {#4} }
                  }
              }
          }
      }
%    \end{macrocode}
%   One active (category code~$13$) character for a single byte, protected
%   so that it survives the surrounding \cs{tex_expanded:D}.
%    \begin{macrocode}
    \cs_new:Npn \@@_generate:n #1
      {
        \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
          { \char_generate:nn {#1} { 13 } }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\__kernel_codepoint_to_bytes:n}
% \begin{macro}[EXP]{\@@_to_bytes_auxi:n}
% \begin{macro}[EXP]{\@@_to_bytes_auxii:Nnn}
% \begin{macro}[EXP]{\@@_to_bytes_auxiii:n}
% \begin{macro}[EXP]
%   {
%     \@@_to_bytes_outputi:nw   ,
%     \@@_to_bytes_outputii:nw  ,
%     \@@_to_bytes_outputiii:nw ,
%     \@@_to_bytes_outputiv:nw
%   }
% \begin{macro}[EXP]
%   {\@@_to_bytes_output:nnn, \@@_to_bytes_output:fnn}
% \begin{macro}[EXP]{\@@_to_bytes_end:}
%   This code converts a codepoint into the correct UTF-8 representation.
%   In terms of the algorithm itself, see
%   \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern.
%    \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_to_bytes:n #1
  {
    \exp_args:Nf \@@_to_bytes_auxi:n
      { \int_eval:n {#1} }
  }
%    \end{macrocode}
%   The \meta{codepoint} has been evaluated to a plain integer at this
%   stage. The number of bytes is chosen from its size, and the result is
%   always four brace groups, with those not needed left empty. Only the
%   range up to |"7F| is a single byte in UTF-8, so the first test has to
%   be against |"7F|: codepoint |"80| must take the two-byte path, which is
%   also what the $8$-bit \cs{codepoint_generate:nn} assumes for every
%   codepoint not below |"80|.
%    \begin{macrocode}
\cs_new:Npn \@@_to_bytes_auxi:n #1
  {
    \if_int_compare:w #1 > "7F \exp_stop_f:
      \if_int_compare:w #1 < "800 \exp_stop_f:
        \@@_to_bytes_outputi:nw
          { \@@_to_bytes_auxii:Nnn C {#1} { 64 } }
        \@@_to_bytes_outputii:nw
          { \@@_to_bytes_auxiii:n {#1} }
      \else:
        \if_int_compare:w #1 < "10000 \exp_stop_f:
          \@@_to_bytes_outputi:nw
            { \@@_to_bytes_auxii:Nnn E {#1} { 64 * 64 } }
          \@@_to_bytes_outputii:nw
            {
              \@@_to_bytes_auxiii:n
                { \int_div_truncate:nn {#1} { 64 } }
            }
          \@@_to_bytes_outputiii:nw
            { \@@_to_bytes_auxiii:n {#1} }
        \else:
          \@@_to_bytes_outputi:nw
            {
              \@@_to_bytes_auxii:Nnn F
                {#1} { 64 * 64 * 64 }
            }
          \@@_to_bytes_outputii:nw
            {
              \@@_to_bytes_auxiii:n
                { \int_div_truncate:nn {#1} { 64 * 64 } }
            }
          \@@_to_bytes_outputiii:nw
            {
              \@@_to_bytes_auxiii:n
                { \int_div_truncate:nn {#1} { 64 } }
            }
          \@@_to_bytes_outputiv:nw
            { \@@_to_bytes_auxiii:n {#1} }
        \fi:
      \fi:
    \else:
      \@@_to_bytes_outputi:nw {#1}
    \fi:
    \@@_to_bytes_end: { } { } { } { }
  }
\cs_new:Npn \@@_to_bytes_auxii:Nnn #1#2#3
  {  "#10 + \int_div_truncate:nn {#2} {#3} }
%    \end{macrocode}
%   The expression above is for a leading byte: the letter |#1| followed by
%   |0| forms a hexadecimal constant, to which the truncated quotient of
%   the codepoint |#2| by |#3| is added. The expression below is for a
%   continuation byte: the value modulo $64$, plus $128$.
%    \begin{macrocode}
\cs_new:Npn \@@_to_bytes_auxiii:n #1
  { \int_mod:nn {#1} { 64 } + 128 }
%    \end{macrocode}
%   Each output function takes the expression for its byte as |#1| and
%   grabs everything up to \cs{@@_to_bytes_end:} as |#2|. It also grabs the
%   brace groups which follow the end marker, up to and including the one
%   for its own position. The evaluated byte is then put back in place of
%   that last group, with the earlier groups retained unchanged.
%    \begin{macrocode}
\cs_new:Npn \@@_to_bytes_outputi:nw
  #1 #2 \@@_to_bytes_end: #3
  { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { } {#2} }
\cs_new:Npn \@@_to_bytes_outputii:nw
  #1 #2 \@@_to_bytes_end: #3#4
  { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} }
\cs_new:Npn \@@_to_bytes_outputiii:nw
  #1 #2 \@@_to_bytes_end: #3#4#5
  {
    \@@_to_bytes_output:fnn
      { \int_eval:n {#1} } { {#3} {#4} } {#2}
  }
\cs_new:Npn \@@_to_bytes_outputiv:nw
  #1 #2 \@@_to_bytes_end: #3#4#5#6
  {
    \@@_to_bytes_output:fnn
      { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2}
  }
%    \end{macrocode}
%   Re-insert the pending code |#3|, then the end marker followed by the
%   groups already filled (|#2|) and the new byte (|#1|). The end marker
%   itself expands to nothing, which leaves just the brace groups.
%    \begin{macrocode}
\cs_new:Npn \@@_to_bytes_output:nnn #1#2#3
  {
    #3
    \@@_to_bytes_end: #2 {#1}
  }
\cs_generate_variant:Nn \@@_to_bytes_output:nnn { f }
\cs_new:Npn \@@_to_bytes_end: { }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\codepoint_to_category:n}
%   Get the value and convert back to the string.
%    \begin{macrocode}
\cs_new:Npn \codepoint_to_category:n #1
  {
%    \end{macrocode}
%   The table lookup gives the integer for the category. Its roman-numeral
%   form is part of the name of the string constant which holds the
%   two-letter identifier, so building that name with \cs{cs:w} yields the
%   identifier.
%    \begin{macrocode}
    \cs:w
      c_@@_category_
      \tex_romannumeral:D 
        \__kernel_codepoint_data:nn { category } {#1}
      _str
    \cs_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\codepoint_to_nfd:n, \@@_to_nfd:n}
% \begin{macro}[EXP]{\@@_to_nfd:nn}
% \begin{macro}[EXP]{\@@_to_nfd:nnn}
% \begin{macro}[EXP]{\@@_to_nfd:nnnn}
%   Conversion to NFD is a potentially-recursive process: the key is to
%   check if we get the input codepoint back again. As far as possible,
%   we use the same path for all engines.
%    \begin{macrocode}
\cs_new:Npn \codepoint_to_nfd:n #1
  { \exp_args:Ne \@@_to_nfd:n { \int_eval:n {#1} } }
%    \end{macrocode}
%   Pick the category code to use for the output: by default, that
%   currently assigned to the \meta{codepoint}. For engines failing the
%   OpenType test the function is redefined so that codepoints above |"80|
%   are given category code~$12$ rather than looking up a value.
%    \begin{macrocode}
\cs_new:Npn \@@_to_nfd:n #1
  { \@@_to_nfd:nn {#1} { \char_value_catcode:n {#1} } }
\sys_if_engine_opentype:F
  {
    \cs_gset:Npn \@@_to_nfd:n #1
      {
        \int_compare:nNnTF {#1} > { "80 }
          { \@@_to_nfd:nn {#1} { 12 } }
          { \@@_to_nfd:nn {#1} { \char_value_catcode:n {#1} } }
      }
  }
%    \end{macrocode}
%   Look up the decomposition, which is two brace groups, and pass these on
%   unbraced along with the input codepoint and the category code.
%    \begin{macrocode}
\cs_new:Npn \@@_to_nfd:nn #1#2
  {
    \exp_args:Ne \@@_to_nfd:nnn
      { \@@_nfd:n {#1} } {#1} {#2}
  }
\cs_new:Npn \@@_to_nfd:nnn #1#2#3 { \@@_to_nfd:nnnn #1 {#2} {#3} }
%    \end{macrocode}
%   Here |#1| and |#2| are the decomposition, |#3| the input codepoint and
%   |#4| the category code. If the first part is the input itself we are
%   done and generate the character; otherwise decompose the first part
%   again, and likewise the second if there is one.
%    \begin{macrocode}
\cs_new:Npn \@@_to_nfd:nnnn #1#2#3#4
  {
    \int_compare:nNnTF {#1} = {#3}
      { \codepoint_generate:nn {#1} {#4} }
      {
        \@@_to_nfd:nn {#1} {#4}
        \tl_if_blank:nF {#2}
          { \@@_to_nfd:nn {#2} {#4} }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Data loader}
%
% Text operations require data from the Unicode Consortium. Data read into
% Unicode engine formats is at best a small part of what we need, so there
% is a loader here to set up the appropriate data structures.
%
% Where we need data for most or all of the Unicode range, we use the two-stage
% table approach recommended by the Unicode Consortium and demonstrated in a
% model implementation in Python in
% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the
% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for
% random access and avoids significant hash table usage. In contrast, where
% only a small subset of codepoints are required, storage as macros is
% preferable. There is also some consideration of the effort needed to load
% data: see for example the grapheme breaking information, which would be 
% problematic to convert into a two-stage table but which can be used with
% reasonable performance in a small number of comma lists (at the cost that
% breaking at higher codepoint Hangul characters will be slightly slow).
%
% \begin{variable}{\c_@@_block_size_int}
%   Choosing the block size for the blocks in the two-stage approach is
%   non-trivial: depending on the data stored, the optimal size for
%   memory usage will vary. At the same time, for us there is also the
%   question of load-time: larger blocks require longer comma lists
%   as intermediates, so are slower. As this is going to be needed
%   to use the data, we set it up outside of the group for clarity.
%    \begin{macrocode}
\int_const:Nn \c_@@_block_size_int { 64 }
%    \end{macrocode}
% \end{variable}
%
% Parsing the data files can be the same way for all engines, but where they
% are stored as character tokens, the construction method depends on whether
% they are Unicode or $8$-bit internally. Parsing is therefore done by common
% functions, with some data storage using engine-specific auxiliaries.
%
% As only the data needs to remain at the end of this process, everything
% is set up inside a group. The only thing that is outside is creating a
% stream: they are global anyway and it is best to force a stream for
% all engines.
%
% \begin{variable}{\g_@@_data_ior}
%   The stream used for reading \texttt{UnicodeData.txt},
%   \texttt{CaseFolding.txt} and \texttt{SpecialCasing.txt} in turn.
%    \begin{macrocode}
\ior_new:N \g_@@_data_ior
%    \end{macrocode}
% \end{variable}
%
% We need some setup for the two-part table approach. The number of blocks we
% need will be variable, but the resulting size of the stage one table
% is predictable. For performance reasons, we therefore create the stage one
% tables now so they can be used immediately, and will later rename them as
% constant tables. For each two-stage table construction, we need a comma
% list to hold the partial block and a couple of integers to track where
% we are up to. To avoid burning registers, the latter are stored in macros
% and are \enquote{fake} integers. We also avoid any \texttt{new} functions,
% keeping as much as possible local.
%
% As we need both positive and negative values, case data requires one
% two-stage table for each transformation. In contrast, general Unicode
% properties could be stored in one table with appropriate combination rules:
% that is not done at present but is likely to be added over time. Here, all
% that is needed is additional entries into the comma-list to create the
% structures.
%
% Notice that in the standard \pkg{expl3} way we are indexing by position not
% offset: that does mean a little work later.
%    \begin{macrocode}
\group_begin:
  \clist_map_inline:nn
    { category , uppercase , lowercase }
    {
%    \end{macrocode}
%   For each table: an empty partial block, the number which the next new
%   block will receive (starting at~$1$), the number of stage-one entries
%   filled so far (starting at~$0$), and the stage-one \texttt{intarray}
%   itself, with one entry for each block of the Unicode range.
%    \begin{macrocode}
      \cs_set_nopar:cpn { l_@@_ #1 _block_clist } { }
      \cs_set_nopar:cpn { l_@@_ #1 _block_tl } { 1 }
      \cs_set_nopar:cpn { l_@@_ #1 _pos_tl } { 0 }
      \intarray_new:cn { g_@@_ #1 _index_intarray }
        { \int_div_truncate:nn { "110000 } \c_@@_block_size_int }
    }
%    \end{macrocode}
%  We need an integer value when matching the current block to those we have
%  already seen, and a way to track codepoints for handling ranges. Again,
%  we avoid using up registers or creating global names.
%    \begin{macrocode}
  \cs_set_nopar:Npn \l_@@_next_codepoint_fint_tl { 0 }
  \cs_set_nopar:Npn \l_@@_matched_block_tl { 0 }
%    \end{macrocode}
% For Unicode general category, there needs to be a numerical representation
% of each possible value. As we need to go from string to number here, but
% the other way elsewhere, we set up fast mappings both ways, but one set
% local and the other as constants. The numbers are allocated in sequence
% starting from~$1$, in the order in which the identifiers are listed below.
% The local macros are named using the identifier and hold the number; the
% constants are named using the roman-numeral form of the number and hold
% the identifier.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w #1#2
    {
      \quark_if_recursion_tail_stop:n {#2}
      \cs_set_nopar:cpn { l_@@_category_ #2 _tl } {#1}
      \str_const:cn { c_@@_category_ \tex_romannumeral:D #1 _str } {#2}
      \exp_args:Ne \@@_data_auxi:w { \int_eval:n { #1 + 1 } }
    }
  \@@_data_auxi:w { 1 }
    { Lu } { Ll } { Lt } { Lm } { Lo }
    { Mn } { Me } { Mc }
    { Nd } { Nl } { No }
    { Zs } { Zl } { Zp }
    { Cc } { Cf } { Co } { Cs } { Cn }
    { Pd } { Ps } { Pe } { Pc } { Po } { Pi } { Pf }
    { Sm } { Sc } { Sk } { So }
    \q_recursion_tail
    \q_recursion_stop
%    \end{macrocode}
% Parse the main Unicode data file and pull out the NFD and case changing
% data. The NFD data is stored using the hash table approach and can yield
% a predictable number of codepoints: one or two. We also need the case data,
% which will be modified further below. To allow for finding ranges, the
% description of the codepoint needs to be carried forward.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w
    #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ;
    {
%    \end{macrocode}
%   The first nine fields of the line are split off here. Field |#6| is
%   the one used as the decomposition: it is skipped if blank or if it
%   starts with |<|. The codepoint, description and category are then
%   passed on in front of the rest of the line, which is still in the
%   input.
%    \begin{macrocode}
      \tl_if_blank:nF {#6}
        {
          \tl_if_head_eq_charcode:nNF {#6}  < % >
            { \@@_data_auxii:w #1 ; #6 ~ \q_stop }
        }
      \@@_data_auxiii:w #1 ; #2 ; #3 ;
    }
%    \end{macrocode}
%   Store the decomposition as two brace groups of hexadecimal codepoints
%   (each with a leading |"|): the second group is empty when there is only
%   one codepoint. The constant is named using the character(s) produced by
%   \cs{codepoint_str_generate:n} for the codepoint.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
    {
      \tl_const:ce
        { c_@@_nfd_ \codepoint_str_generate:n {"#1} _tl }
        {
          {"#2}
          { \tl_if_blank:nF {#3} {"#3} }
        }
    }
%    \end{macrocode}
% The category data needs to be converted from a string to the numerical
% equivalent: a simple operation.
% The case data is going to be stored as an offset from the parent character,
% rather than an absolute value. We therefore deal with that plus the situation
% where a codepoint has no mapping data in one shot.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxiii:w
    #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ~ \q_stop
    {
%    \end{macrocode}
%   Everything is expanded in one step with \cs{use:e}: the category
%   becomes its number, |#7| and |#8| become offsets from the codepoint
%   |#1|, and |#9| is passed on unchanged. The next stage uses these as
%   the upper-, lower- and titlecase data, respectively.
%    \begin{macrocode}
      \use:e
        {
          \@@_data_auxiv:w
            #1 ; #2 ;
            \@@_data_category:n {#3} ;
            \@@_data_offset:nn {#1} {#7} ;
            \@@_data_offset:nn {#1} {#8} ;
            #9;
        }
    }
%    \end{macrocode}
%   The category number comes from the local mapping set up earlier. The
%   offset is zero when there is no mapping, otherwise it is the difference
%   between the two hexadecimal values.
%    \begin{macrocode}
  \cs_set:Npn \@@_data_category:n #1
    { \use:c { l_@@_category_ #1 _tl } }
  \cs_set:Npn \@@_data_offset:nn #1#2
    {
      \tl_if_blank:nTF {#2}
        { 0 }
        { \int_eval:n { "#2 - "#1 } }
    }
%    \end{macrocode}
% To deal with ranges, we track the position of the next codepoint expected.
% If there is a gap, we deal with that separately: it could be a range or
% an unused part of the Unicode space. As such, we deal with the current
% codepoint here whether or not there is a range to fill in. Upper- and
% lowercase data go into the two-stage table, any titlecase exception is
% just stored in a macro. The data for the codepoint is added to the current
% block, and if that is now complete we move on to save the block. The
% case exceptions are all stored as codepoints, with a fixed number of
% balanced text entries as we know that there are never more than three.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxiv:w #1 ; #2 ; #3 ; #4 ; #5 ; #6 ;
    {
%    \end{macrocode}
%   Arguments: |#1| codepoint (hexadecimal), |#2| description, |#3|
%   category number, |#4| and |#5| upper- and lowercase offsets, |#6|
%   titlecase codepoint (possibly empty). A codepoint beyond the one
%   expected means there is a gap to fill before adding the current entry.
%   A titlecase constant is stored only where the titlecase offset differs
%   from the uppercase one.
%    \begin{macrocode}
      \int_compare:nNnT {"#1} > \l_@@_next_codepoint_fint_tl
        {
          \@@_data_auxv:nnnnw {#1} {#3} {#4} {#5}
            #2 Last> \q_stop
        }
      \@@_add:nn { category } {#3}
      \@@_add:nn { uppercase } {#4}
      \@@_add:nn { lowercase } {#5}
      \int_compare:nNnF {#4} = { \@@_data_offset:nn {#1} {#6} }
        {
          \tl_const:ce
            { c_@@_titlecase_ \codepoint_str_generate:n {"#1} _tl }
            { {"#6} { } { } }
        }
      \tl_set:Ne \l_@@_next_codepoint_fint_tl
        { \int_eval:n { "#1 + 1 } }
    }
%    \end{macrocode}
%   Append one value to the partial block for table |#1|, and save the
%   block once it has reached the block size.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_add:nn #1#2
    {
      \clist_put_right:cn { l_@@_ #1 _block_clist } {#2}
      \int_compare:nNnT { \clist_count:c { l_@@_ #1 _block_clist } }
        = \c_@@_block_size_int
        { \@@_save_blocks:nn {#1} { 1 } }
    }
%    \end{macrocode}
%  Distinguish between a range and a gap, and pass on the appropriate value(s).
%  The general category for unassigned characters is \texttt{Cn}, so we
%  find the correct value once and then use that.
%    \begin{macrocode}
  \cs_set_protected:Npe \@@_data_auxv:nnnnw #1#2#3#4#5 Last> #6 \q_stop
    {
%    \end{macrocode}
%   The caller appends |Last>| to the description, so |#6| is blank unless
%   the description itself contains |Last>|. When it is blank this is a
%   gap, filled with the \texttt{Cn} category and zero offsets; otherwise
%   it is a range, filled with the values of the current codepoint. The
%   definition is expanded when it is set up, which is how the number for
%   \texttt{Cn} is inserted just once.
%    \begin{macrocode}
      \exp_not:N \tl_if_blank:nTF {#6}
        {
          \exp_not:N \@@_range:nnn {#1} { category }
            { \exp_not:V \l_@@_category_Cn_tl }
          \exp_not:N \@@_range:nnn {#1} { uppercase } { 0 }
          \exp_not:N \@@_range:nnn {#1} { lowercase } { 0 }
        }
        {
          \exp_not:N \@@_range:nnn {#1} { category } {#2}
          \exp_not:N \@@_range:nnn {#1} { uppercase } {#3}
          \exp_not:N \@@_range:nnn {#1} { lowercase } {#4}
        }      
    }
%    \end{macrocode}
%  Calculate the length of the range and the space remaining in the current
%  block.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_range:nnn #1
    {
%    \end{macrocode}
%   Only the end codepoint |#1| (hexadecimal) is grabbed here: it is
%   replaced by the number of codepoints to fill, and the table name and
%   value are picked up by the functions which follow.
%    \begin{macrocode}
      \exp_args:Nf \@@_range_aux:nnn
        { \int_eval:n { "#1 - \l_@@_next_codepoint_fint_tl } }
    }
%    \end{macrocode}
%   Put in front of the length |#1| the number of entries that go into the
%   current block of table |#2|: the smaller of the length and the free
%   space in that block.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_range_aux:nnn #1#2
    {
      \exp_args:Nf \@@_range:nnnn
        {
          \int_min:nn
            {#1}
            {
              \c_@@_block_size_int 
              - \clist_count:c { l_@@_ #2 _block_clist }
            }
        }
        {#1} {#2}
    }
%    \end{macrocode}
%   Here we want to do three things: add to and possibly complete the current
%   block, add complete blocks quickly, then finish up the range in a final
%   open block. We need to avoid, as far as possible, dealing with every
%   single codepoint, so the middle step is optimised.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_range:nnnn #1#2#3#4
    {
%    \end{macrocode}
%   Arguments: |#1| entries for the current block, |#2| total length of the
%   range, |#3| table name, |#4| value to store. For the middle step a full
%   block of identical values is built once (the leading comma being
%   removed by \cs{use_none:n}) and saved for all of the whole blocks in
%   one call. What is left over after that starts the next partial block.
%    \begin{macrocode}
      \prg_replicate:nn {#1}
        { \clist_put_right:cn { l_@@_ #3 _block_clist } {#4} }
    \int_compare:nNnT { \clist_count:c { l_@@_ #3 _block_clist } }
      = \c_@@_block_size_int
      { \@@_save_blocks:nn {#3} { 1 } }
    \int_compare:nNnF
      { \int_div_truncate:nn { #2 - #1 } \c_@@_block_size_int } = 0
      {
        \tl_set:ce { l_@@_ #3 _block_clist }
          {
            \exp_args:NNe \use:nn \use_none:n
              { \prg_replicate:nn { \c_@@_block_size_int } { , #4 } }
          }
        \@@_save_blocks:nn {#3}
          { \int_div_truncate:nn { (#2 - #1) } \c_@@_block_size_int }
      }
    \prg_replicate:nn
      { \int_mod:nn { #2 - #1 } \c_@@_block_size_int }
      { \clist_put_right:ce { l_@@_ #3 _block_clist } {#4} }
    }
%    \end{macrocode}
%   To allow rapid comparison, each completed block is stored locally as a
%   comma list: once all of the blocks have been created, they are converted
%   into an \texttt{intarray} in one step. The aim here is to check the current
%   block against those we've already used, and either match to an existing
%   block or save a new block.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_save_blocks:nn #1#2
    {
%    \end{macrocode}
%   Here |#1| is the table name and |#2| the number of consecutive
%   stage-one entries which use the current block. The matched block
%   starts out as the number a new block would receive, and is overwritten
%   if an identical block has already been stored. If it is unchanged after
%   the search, the current block is stored under that number and the
%   block counter is stepped.
%    \begin{macrocode}
      \tl_set_eq:Nc \l_@@_matched_block_tl { l_@@_ #1 _block_tl }
      \int_step_inline:nn { \tl_use:c { l_@@_ #1 _block_tl } - 1 }
        {
          \tl_if_eq:ccT { l_@@_ #1 _block_clist }
            { l_@@_ #1 _block_ ##1 _clist }
            { \tl_set:Nn \l_@@_matched_block_tl {##1} }
        }
      \int_compare:nNnT
        { \tl_use:c { l_@@_ #1 _block_tl } } = \l_@@_matched_block_tl
          {
            \clist_set_eq:cc
              {
                l_@@_ #1 _block_
                \tl_use:c { l_@@_ #1 _block_tl } _clist
              }
              { l_@@_ #1 _block_clist }
            \tl_set:ce { l_@@_ #1 _block_tl }
              { \int_eval:n { \tl_use:c { l_@@_ #1 _block_tl } + 1 } }
          }
%    \end{macrocode}
% Here, we avoid \cs{prg_replicate:nn} as the number of tokens generated would be
% high: that shows in the format dump (although \TeX{} recovers memory during
% the subsequent runs). Each of the |#2| stage-one entries is set to the
% matched block number, the position is advanced to match, and the partial
% block is cleared ready for the next one.
%    \begin{macrocode}
        \int_step_inline:nnn
          { \tl_use:c { l_@@_ #1 _pos_tl } + 1 }
          { \tl_use:c { l_@@_ #1 _pos_tl } + #2 }
          {
            \exp_args:Nc \__kernel_intarray_gset:Nnn
              { g_@@_ #1 _index_intarray }
              {##1}
              \l_@@_matched_block_tl
          }
        \tl_set:ce { l_@@_ #1 _pos_tl }
          { \int_eval:n { \tl_use:c { l_@@_ #1 _pos_tl } + #2 } } 
      \clist_clear:c { l_@@_ #1 _block_clist }
    }
%    \end{macrocode}
% Close out the final block, rename the first stage table, then combine all
% of the block comma-lists into one large second-stage table with offsets.
% As we use an index not an offset, there is a little back-and-forward to do.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_finalise_blocks:
    {
%    \end{macrocode}
%   For each table, fill from the next expected codepoint up to |"110000|
%   with the value~$0$ (the range function adds the |"| itself), then
%   convert the stored blocks.
%    \begin{macrocode}
      \clist_map_inline:nn { category , uppercase , lowercase }
        {
          \@@_range:nnn { 110000 } {##1} { 0 }
          \@@_finalise_blocks:n {##1}
        }
    }
%    \end{macrocode}
%   The stage-one table is copied to its constant name and the working
%   name undefined. The stage-two table is created with room for every
%   stored block, filled block by block, then renamed in the same way.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_finalise_blocks:n #1
    {
      \cs_gset_eq:cc { c_@@_ #1 _index_intarray } { g_@@_ #1 _index_intarray }
      \cs_undefine:c { g_@@_ #1 _index_intarray }
      \intarray_new:cn { g_@@_ #1 _blocks_intarray }
        { ( \tl_use:c { l_@@_ #1 _block_tl } - 1 ) * \c_@@_block_size_int }
      \int_step_inline:nn { \tl_use:c { l_@@_ #1 _block_tl } - 1 }
        {
          \exp_args:Nv \@@_finalise_blocks:nnn
            { l_@@_ #1 _block_ ##1 _clist }
            {##1} {#1}
        }
      \cs_gset_eq:cc { c_@@_ #1 _blocks_intarray }
        { g_@@_ #1 _blocks_intarray }
      \cs_undefine:c { g_@@_ #1 _blocks_intarray }
    }
%    \end{macrocode}
%   Here |#1| is the content of one block, |#2| its number and |#3| the
%   table name. The loop is started at position~$1$ within the block, with
%   the offset of the block in the stage-two table worked out once.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_finalise_blocks:nnn #1#2#3
    {
      \exp_args:Nnf \@@_finalise_blocks:nnnw { 1 }
        { \int_eval:n { ( #2 - 1 ) * \c_@@_block_size_int } }
        {#3}
        #1 , \q_recursion_tail , \q_recursion_stop
    }
%    \end{macrocode}
%   Store one comma-list item |#4| at position |#1| plus the block offset
%   |#2|, then loop with the position stepped, until the tail marker is
%   found.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_finalise_blocks:nnnw #1#2#3#4 ,
    {
      \quark_if_recursion_tail_stop:n {#4}
      \intarray_gset:cnn { g_@@_ #3 _blocks_intarray }
        { #1 + #2 }
        {#4}
      \exp_args:Nf \@@_finalise_blocks:nnnw
        { \int_eval:n { #1 + 1 } } {#2} {#3}
    }
%    \end{macrocode}
%  With the setup done, read the main data file: it's easiest to do that as
%  a token list with spaces retained.
%    \begin{macrocode}
  \ior_open:Nn \g_@@_data_ior { UnicodeData.txt }
  \group_begin:
    \char_set_catcode_space:n { `\  }%
    \ior_map_variable:NNn \g_@@_data_ior \l_@@_tmpa_tl
      {%
%    \end{macrocode}
%   A line which is just a space ends the mapping. Any other line is
%   inserted in front of \cs{q_stop} and handed to the parser.
%    \begin{macrocode}
        \if_meaning:w \l_@@_tmpa_tl \c_space_tl
          \exp_after:wN \ior_map_break:
        \fi:
        \exp_after:wN \@@_data_auxi:w \l_@@_tmpa_tl \q_stop
      }%
    \@@_finalise_blocks:
  \group_end:
%    \end{macrocode}
%   Close the stream explicitly, as is done after reading each of the other
%   data files, rather than leaving it open.
%    \begin{macrocode}
  \ior_close:N \g_@@_data_ior
\group_end:
%    \end{macrocode}
%
% \begin{macro}[EXP]{\__kernel_codepoint_data:nn}
% \begin{macro}[EXP]{\@@_data:nnn}
%   Recover data from a two-stage table: entirely generic as this applies to
%   all tables (as we use the same block size for all of them). Notice that
%   as we use indices not offsets we have to shuffle out-by-one issues. This
%   function is needed \emph{before} loading the special casing data, as there
%   we need to be able to check the standard case mappings.
%    \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_data:nn #1#2
  {
%    \end{macrocode}
%   Here |#1| is the table name and |#2| the codepoint. The stage-one
%   lookup uses the block index of the codepoint plus one; the block
%   number found, less one, is multiplied by the block size to give the
%   offset of that block in the stage-two table.
%    \begin{macrocode}
    \exp_args:Nf \@@_data:nnn
      {
        \int_eval:n
          {
            \c_@@_block_size_int *
              (
                \intarray_item:cn { c_@@_ #1 _index_intarray }
                  {
                    \int_div_truncate:nn {#2}
                      \c_@@_block_size_int
                    + 1
                  }
                  - 1
              )
          }
      }
      {#2} {#1}
  }
%    \end{macrocode}
%   The stage-two lookup: block offset |#1|, plus the position of the
%   codepoint |#2| within its block, plus one.
%    \begin{macrocode}
\cs_new:Npn \@@_data:nnn #1#2#3
  {
    \intarray_item:cn { c_@@_ #3 _blocks_intarray }
      { #1 + \int_mod:nn {#2} \c_@@_block_size_int + 1 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% The other data files all use C-style comments so we have to worry about
% |#| tokens (and reading as strings). The set up for case folding is in two
% parts. For the basic (core) mappings, folding is the same as lower casing in
% most positions so only store the differences. For the more complex foldings,
% always store the result, splitting up the two or three code points in the input
% as required.
%    \begin{macrocode}
\group_begin:
  \ior_open:Nn \g_@@_data_ior { CaseFolding.txt }
%    \end{macrocode}
% Each data line is split into codepoint |#1|, status |#2| and mapping
% |#3|. For status \texttt{C} a constant is stored only when the mapping
% differs from the lowercase value found in the two-stage table. For status
% \texttt{F} the mapping is passed on to be split into codepoints. Lines
% with any other status are ignored.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ;~ #3 ; #4 \q_stop
    {
      \if:w \tl_head:n { #2 ? } C
        \reverse_if:N \if_int_compare:w
          \int_eval:n { \__kernel_codepoint_data:nn { lowercase } {"#1} + "#1 }
            = "#3 ~
          \tl_const:ce
            { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
            { {"#3} { } { } }
        \fi:
      \else:
        \if:w \tl_head:n { #2 ? } F
          \@@_data_auxii:w #1 ~ #3 ~ \q_stop
        \fi:
      \fi:
    }
%    \end{macrocode}
% Here, |#4| can have a trailing space, so we tidy up a bit at the cost of
% speed for these small number of cases it applies to. The tidying is done
% by converting the value to a number and back to hexadecimal digits.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
    {
      \tl_const:ce { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
        {
          {"#2}
          {"#3}
          { \tl_if_blank:nF {#4} { " \int_to_Hex:n {"#4} } }
        }
    }
%    \end{macrocode}
% Read the file line by line as strings, skipping every line whose first
% character is a hash. The extra hash placed after |#1| gives
% \cs{tl_head:w} something to find when the line is empty.
%    \begin{macrocode}
  \ior_str_map_inline:Nn \g_@@_data_ior
    {
      \reverse_if:N \if:w \c_hash_str \tl_head:w #1 \c_hash_str \q_stop
        \@@_data_auxi:w #1 \q_stop
      \fi:
    }
  \ior_close:N \g_@@_data_ior
%    \end{macrocode}
% For upper- and lowercasing special situations, there is a bit more to
% do as we also have titlecasing to consider, plus we need to stop part-way
% through the file.
%    \begin{macrocode}
  \ior_open:Nn \g_@@_data_ior { SpecialCasing.txt }
%    \end{macrocode}
% Each data line is split into codepoint |#1| and three mappings, used
% here as lower (|#2|), title (|#3|) and upper (|#4|). The lower and upper
% mappings are always passed on; the title one only when it differs from
% the upper one. The \cs{use:n} wrappers leave an explicit trailing space
% in front of \cs{q_stop}, so the delimited arguments of the auxiliary
% always match.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w
    #1 ;~ #2 ;~ #3 ;~ #4 ; #5 \q_stop
    {
      \use:n { \@@_data_auxii:w #1 ~ lower ~ #2 ~ } ~ \q_stop
      \use:n { \@@_data_auxii:w #1 ~ upper ~ #4 ~ } ~ \q_stop
      \str_if_eq:nnF {#3} {#4}
        { \use:n { \@@_data_auxii:w #1 ~ title ~ #3 ~ } ~ \q_stop }
    }
%    \end{macrocode}
% Here |#2| is the kind of mapping and |#3| to |#5| the codepoints. Nothing
% is stored when the second codepoint |#4| is empty, that is, for a
% mapping to a single codepoint.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxii:w
    #1 ~ #2 ~ #3 ~ #4 ~ #5 \q_stop
    {
      \tl_if_empty:nF {#4}
        {
          \tl_const:ce { c_@@_ #2 case_ \codepoint_str_generate:n {"#1} _tl }
            {
              {"#3}
              {"#4}
              { \tl_if_blank:nF {#5} {"#5} }
            }
        }
    }
%    \end{macrocode}
% Lines starting with a hash are skipped, except that the one reading
% \enquote{\texttt{\# Conditional Mappings}} ends the reading of the file.
%    \begin{macrocode}
  \ior_str_map_inline:Nn \g_@@_data_ior
    {
      \str_if_eq:eeTF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str }
        {
          \str_if_eq:eeT
            {#1}
            { \c_hash_str \c_space_tl Conditional~Mappings }
            { \ior_map_break: }
        }
        { \@@_data_auxi:w #1 \q_stop }
    }
  \ior_close:N \g_@@_data_ior
\group_end:
%    \end{macrocode}
%
% \begin{macro}[EXP]{\__kernel_codepoint_case:nn}
% \begin{macro}[EXP]{\@@_case:nnn}
% \begin{macro}[EXP]
%   {\@@_uppercase:n, \@@_lowercase:n, \@@_titlecase:n, \@@_casefold:n}
% \begin{macro}[EXP]{\@@_case:nn}
%   With the core data files loaded, there is now a need to provide access to
%   this information for other modules. That is done here such that case
%   folding can also be covered. At this level, all that needs to be returned
%   is the codepoint(s) of the result, as three brace groups of which the
%   unused ones are empty.
%    \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_case:nn #1#2
  {
%    \end{macrocode}
%   Here |#1| is the name of the mapping and |#2| the codepoint. The
%   character form of the codepoint is needed as it is part of the name of
%   any stored exception.
%    \begin{macrocode}
    \exp_args:Ne \@@_case:nnn
      { \codepoint_str_generate:n {#2} } {#1} {#2}
  }
%    \end{macrocode}
%   Use the stored exception if there is one, otherwise the function for
%   this mapping.
%    \begin{macrocode}
\cs_new:Npn \@@_case:nnn #1#2#3
  {
    \cs_if_exist:cTF { c_@@_ #2 _ #1 _tl }
      {
        \tl_use:c
          { c_@@_ #2 _ #1 _tl }
      }
      { \use:c { @@_ #2 :n } {#3} }
  }
%    \end{macrocode}
%   Without an exception, titlecasing uses the uppercase table and case
%   folding uses the lowercase one. The stored offset is added to the
%   codepoint and the result is padded with two empty groups, to give the
%   same shape as the stored exceptions.
%    \begin{macrocode}
\cs_new:Npn \@@_uppercase:n { \@@_case:nn { uppercase } }
\cs_new:Npn \@@_lowercase:n { \@@_case:nn { lowercase } }
\cs_new:Npn \@@_titlecase:n { \@@_case:nn { uppercase } }
\cs_new:Npn \@@_casefold:n  { \@@_case:nn { lowercase } }
\cs_new:Npn \@@_case:nn #1#2
  {
    { \int_eval:n { \__kernel_codepoint_data:nn {#1} {#2} + #2 } }
    { }
    { }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_nfd:n}
% \begin{macro}[EXP]{\@@_nfd:nn}
%   A simple interface.
%    \begin{macrocode}
\cs_new:Npn \@@_nfd:n #1
  { \exp_args:Ne \@@_nfd:nn { \codepoint_str_generate:n {#1} } {#1} }
%    \end{macrocode}
%   Here |#1| is the character form of the codepoint and |#2| the
%   codepoint itself. The result is always two brace groups: the stored
%   decomposition if there is one, otherwise the codepoint followed by an
%   empty group.
%    \begin{macrocode}
\cs_new:Npn \@@_nfd:nn #1#2
  {
    \tl_if_exist:cTF { c_@@_nfd_ #1 _tl }
      { \tl_use:c { c_@@_nfd_ #1 _tl } }
      { {#2} { } }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
%    \begin{macrocode}
%<@@=text>
%    \end{macrocode}
%
%  Read the Unicode grapheme data. This is quite easy to handle and we only need
%  codepoints, not characters, so there is no need to worry about the engine in use.
%  As reading as a string is most convenient, we have to do some work to remove
%  spaces: the hardest part of the entire process!
%    \begin{macrocode}
\ior_new:N \g_@@_data_ior
\group_begin:
  \ior_open:Nn \g_@@_data_ior { GraphemeBreakProperty.txt }
%    \end{macrocode}
%  Two local macros are used as scratch space: the first collects the list
%  of ranges for the property class currently being read, the second holds
%  the name of that class.
%    \begin{macrocode}
  \cs_set_nopar:Npn \l_@@_tmpa_str { }
  \cs_set_nopar:Npn \l_@@_tmpb_str { }
%    \end{macrocode}
%  Here |#1| is the codepoint or range and |#2| the property name. When the
%  name changes, the list collected for the previous name is saved as a
%  constant (the leading comma being removed by \cs{use_none:n}) and
%  collection starts afresh. The entry is then passed on three times over,
%  so that a single codepoint and a range can be split the same way.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ~ #3 \q_stop
    {
      \str_if_eq:VnF \l_@@_tmpb_str {#2}
        {
          \str_if_empty:NF \l_@@_tmpb_str
            {
              \clist_const:ce { c_@@_grapheme_ \l_@@_tmpb_str _clist }
                { \exp_after:wN \use_none:n \l_@@_tmpa_str }
              \cs_set_nopar:Npn \l_@@_tmpa_str { }
            }
          \cs_set_nopar:Npn \l_@@_tmpb_str {#2}
        }
      \@@_data_auxii:w #1 .. #1 .. #1 \q_stop
    }
%    \end{macrocode}
%  Every entry is stored as a range \meta{start}|..|\meta{end}, with both
%  ends the same for a single codepoint.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxii:w #1 .. #2 .. #3 \q_stop
    {
      \cs_set_nopar:Npe \l_@@_tmpa_str
        {
          \l_@@_tmpa_str ,
          \tl_trim_spaces:n {#1} .. \tl_trim_spaces:n {#2}
        }
    }
  \ior_str_map_inline:Nn \g_@@_data_ior
    {
      \str_if_eq:eeF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str }
        {
          \tl_if_blank:nF {#1}
            { \@@_data_auxi:w #1 \q_stop }
        }
    }
  \ior_close:N \g_@@_data_ior
%    \end{macrocode}
%  A list is only saved inside the loop when the property name changes, so
%  the one for the last property in the file is still pending at this
%  point: save it now. The test guards against no data having been read at
%  all.
%    \begin{macrocode}
  \str_if_empty:NF \l_@@_tmpb_str
    {
      \clist_const:ce { c_@@_grapheme_ \l_@@_tmpb_str _clist }
        { \exp_after:wN \use_none:n \l_@@_tmpa_str }
    }
\group_end:
%    \end{macrocode}
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex