1 files changed, 587 insertions, 0 deletions
diff --git a/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst
new file mode 100644
index 0000000..e203153
--- /dev/null
+++ b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst
@@ -0,0 +1,587 @@
+%prep
+
+# Find a UTF-8 locale.
+  setopt multibyte
+# Don't let LC_* override our choice of locale.
+  unset -m LC_\*
+  mb_ok=
+  langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
+	 $(locale -a 2>/dev/null | egrep 'utf8|UTF-8'))
+  for LANG in $langs; do
+    if [[ é = ? ]]; then
+      mb_ok=1
+      break;
+    fi
+  done
+  if [[ -z $mb_ok ]]; then
+    ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
+  else
+    print -u $ZTST_fd Testing multibyte with locale $LANG
+    mkdir multibyte.tmp && cd multibyte.tmp
+  fi
+
+%test
+
+  a=ténébreux
+  for i in {1..9}; do
+      print ${a[i]}
+      for j in {$i..9}; do
+	  print $i $j ${a[i,j]} ${a[-j,-i]}
+      done
+  done
+0:Basic indexing with multibyte characters
+>t
+>1 1 t x
+>1 2 té ux
+>1 3 tén eux
+>1 4 téné reux
+>1 5 ténéb breux
+>1 6 ténébr ébreux
+>1 7 ténébre nébreux
+>1 8 ténébreu énébreux
+>1 9 ténébreux ténébreux
+>é
+>2 2 é u
+>2 3 én eu
+>2 4 éné reu
+>2 5 énéb breu
+>2 6 énébr ébreu
+>2 7 énébre nébreu
+>2 8 énébreu énébreu
+>2 9 énébreux ténébreu
+>n
+>3 3 n e
+>3 4 né re
+>3 5 néb bre
+>3 6 nébr ébre
+>3 7 nébre nébre
+>3 8 nébreu énébre
+>3 9 nébreux ténébre
+>é
+>4 4 é r
+>4 5 éb br
+>4 6 ébr ébr
+>4 7 ébre nébr
+>4 8 ébreu énébr
+>4 9 ébreux ténébr
+>b
+>5 5 b b
+>5 6 br éb
+>5 7 bre néb
+>5 8 breu énéb
+>5 9 breux ténéb
+>r
+>6 6 r é
+>6 7 re né
+>6 8 reu éné
+>6 9 reux téné
+>e
+>7 7 e n
+>7 8 eu én
+>7 9 eux tén
+>u
+>8 8 u é
+>8 9 ux té
+>x
+>9 9 x t
+
+  s=é
+  print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
+0:Out of range subscripts with multibyte characters
+>AA BéB CC DéD EE
+
+  print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
+0:Reverse indexing with multibyte characters
+>2 4 éné
+
+  print ${a[(r)én,(r)éb]}
+0:Subscript searching with multibyte characters
+>énéb
+
+  print ${a[(rb:1:)é,-1]}
+  print ${a[(rb:2:)é,-1]}
+  print ${a[(rb:3:)é,-1]}
+  print ${a[(rb:4:)é,-1]}
+  print ${a[(rb:5:)é,-1]}
+0:Subscript searching with initial offset
+>énébreux
+>énébreux
+>ébreux
+>ébreux
+>
+
+  print ${a[(rn:1:)é,-1]}
+  print ${a[(rn:2:)é,-1]}
+  print ${a[(rn:3:)é,-1]}
+0:Subscript searching with count
+>énébreux
+>ébreux
+>
+
+  print ${a[(R)én,(R)éb]}
+0:Backward subscript searching with multibyte characters
+>énéb
+
+# Starting offsets with (R) seem to be so strange as to be hardly
+# worth testing.
+
+  setopt extendedglob
+  [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
+  for i in {1..${#match}}; do
+    print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
+  done
+0:Multibyte offsets in pattern tests
+>én 2 3 én
+>éb 4 5 éb
+
+  b=${(U)a}
+  print $b
+  print ${(L)b}
+  desdichado="Je suis le $a, le veuf, l'inconsolé"
+  print ${(C)desdichado}
+  lxiv="l'état c'est moi"
+  print ${(C)lxiv}
+0:Case modification of multibyte strings
+>TÉNÉBREUX
+>ténébreux
+>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
+>L'État C'Est Moi
+
+  array=(ølaf ødd øpened án encyclopædia)
+  barray=(${(U)array})
+  print $barray
+  print ${(L)barray}
+  print ${(C)array}
+  print ${(C)barray}
+0:Case modification of arrays with multibyte strings
+>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
+>ølaf ødd øpened án encyclopædia
+>Ølaf Ødd Øpened Án Encyclopædia
+>Ølaf Ødd Øpened Án Encyclopædia
+
+  print $(( ##¥ ))
+  pound=£
+  print $(( #pound ))
+  alpha=α
+  print $(( ##α )) $(( #alpha ))
+0:Conversion to Unicode in mathematical expressions
+>165
+>163
+>945 945
+
+  unsetopt posix_identifiers
+  expr='hähä=3 || exit 1; print $hähä'
+  eval $expr
+  setopt posix_identifiers
+  (eval $expr)
+1:POSIX_IDENTIFIERS option
+>3
+?(eval):1: command not found: hähä=3
+
+  foo="Ølaf«Ødd«øpénëd«ån«àpple"
+  print -l ${(s.«.)foo}
+  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
+  print -l ${=ioh}
+  print ${(w)#ioh}
+0:Splitting with multibyte characters
+>Ølaf
+>Ødd
+>øpénëd
+>ån
+>àpple
+>Ἐν
+>ἀρχῇ
+>ἦν
+>ὁ
+>λόγος,
+>καὶ
+>ὁ
+>λόγος
+>ἦν
+>πρὸς
+>τὸν
+>θεόν,
+>καὶ
+>θεὸς
+>ἦν
+>ὁ
+>λόγος.
+>17
+
+  read -d £ one
+  read -d £ two
+  print $one
+  print $two
+0:read with multibyte delimiter
+<first£second£
+>first
+>second
+
+  (IFS=«
+  read -d » -A array
+  print -l $array)
+0:read -A with multibyte IFS
+<dominus«illuminatio«mea»ignored
+>dominus
+>illuminatio
+>mea
+
+  read -k2 -u0 twochars
+  print $twochars
+0:read multibyte characters
+<«»ignored
+>«»
+
+  read -q -u0 mb
+  print $?
+0:multibyte character makes read -q return false
+<«
+>1
+
+  # See if the system grokks first-century Greek...
+  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
+  for (( i = 1; i <= ${#ioh}; i++ )); do
+    # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with
+    # perispomeni and ypogegrammeni, of course) as a lower case character.
+    if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then
+      for tp in upper space punct invalid; do
+        if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then
+          print "$i: $tp"
+	  break
+	fi
+      done
+    fi
+  done
+0:isw* functions on non-ASCII wide characters
+>1: upper
+>3: space
+>8: space
+>11: space
+>13: space
+>19: punct
+>20: space
+>24: space
+>26: space
+>32: space
+>35: space
+>40: space
+>44: space
+>49: punct
+>50: space
+>54: space
+>59: space
+>62: space
+>64: space
+>70: punct
+
+  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος"
+  print ${ioh#[[:alpha:]]##}
+  print ${ioh##[[:alpha:]]##}
+  print ${ioh%[[:alpha:]]##}
+  print ${ioh%%[[:alpha:]]##}
+  print ${(S)ioh#λ*ς}
+  print ${(S)ioh##λ*ς}
+  print ${(S)ioh%θ*ς}
+  print ${(S)ioh%%θ*ς}
+0:Parameter #, ##, %, %% with multibyte characters
+>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
+> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ 
+>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
+>Ἐν ἀρχῇ ἦν ὁ 
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ  ἦν ὁ λόγος
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ 
+
+  a="1ë34ë6"
+  print ${(BEN)a#*4}
+  print ${(BEN)a##*ë}
+  print ${(BEN)a%4*}
+  print ${(BEN)a%%ë*}
+  print ${(SBEN)a#ë3}
+  print ${(SBEN)a%4ë}
+0:Flags B, E, N and S in ${...#...} and ${...%...}
+>1 5 4
+>1 6 5
+>4 7 3
+>2 7 5
+>2 4 2
+>4 6 2
+
+  foo=(κατέβην χθὲς εἰς Πειραιᾶ)
+  print ${(l.3..¥.r.3..£.)foo}
+  print ${(l.4..¥.r.2..£.)foo}
+  print ${(l.5..¥.r.1..£.)foo}
+  print ${(l.4..¥..«.r.4..£..».)foo}
+  print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo}
+0:simultaneous left and right padding
+>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι
+>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα
+>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ
+>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ
+>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ
+# er... yeah, that looks right...
+
+  foo=picobarn
+  print ${foo:s£bar£rod£:s¥rod¥stick¥}
+0:Delimiters in modifiers
+>picostickn
+
+# TODO: if we get paired multibyte bracket delimiters to work
+# (as Emacs does, the smug so-and-so), the following should change.
+  foo=bar
+  print ${(r£5££X£)foo}
+  print ${(l«10««Y««HI«)foo}
+0:Delimiters in parameter flags
+>barXX
+>YYYYYHIbar
+
+  printf "%4.3s\n" főobar
+0:Multibyte characters in printf widths
+> főo
+
+# We ask for case-insensitive sorting here (and supply upper case
+# characters) so that we exercise the logic in the shell that lowers the
+# case of the string for case-insensitive sorting.
+  print -oi HÛH HÔH HÎH HÊH HÂH
+  (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
+0:Multibyte characters in print sorting
+>HÂH HÊH HÎH HÔH HÛH
+>HAH HEH HUH HÈH HÉH
+
+# These are control characters in Unicode, so don't show up.
+# We just want to check they're not being treated as tokens.
+  for x in {128..150}; do
+     print ${(#)x}
+  done | while read line; do
+    print ${#line} $(( #line ))
+  done
+0:evaluated character number with multibyte characters
+>1 128
+>1 129
+>1 130
+>1 131
+>1 132
+>1 133
+>1 134
+>1 135
+>1 136
+>1 137
+>1 138
+>1 139
+>1 140
+>1 141
+>1 142
+>1 143
+>1 144
+>1 145
+>1 146
+>1 147
+>1 148
+>1 149
+>1 150
+
+  touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt
+  setopt numericglobsort
+  print -l ngs*
+0:NUMERIC_GLOB_SORT option in UTF-8 locale
+>ngs1txt
+>ngs2txt
+>ngs10txt
+>ngs20txt
+>ngs100txt
+>ngs200txt
+
+# Not strictly multibyte, but gives us a well-defined locale for testing.
+  foo=$'X\xc0Y\x07Z\x7fT'
+  print -r ${(q)foo}
+0:Backslash-quoting of unprintable/invalid characters uses $'...'
+>X$'\300'Y$'\a'Z$'\177'T
+
+# This also isn't strictly multibyte and is here to reduce the
+# likelihood of a "cannot do character set conversion" error.
+  (print $'\u00e9') 2>&1 | read
+  if [[ $REPLY != é ]]; then
+    print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd
+    print "Check you have a correctly installed iconv library." >&$ZTST_fd
+    # cheat
+    repeat 4 print OK
+  else
+    testfn() { (LC_ALL=C; print $'\u00e9') }
+    repeat 4 testfn 2>&1 | while read line; do
+      if [[ $line = *"character not in range"* ]]; then
+        print OK
+      elif [[ $line = "?" ]]; then
+        print OK
+      else
+        print Failed: no error message and no question mark
+      fi
+    done
+  fi
+  true
+0:error handling in Unicode quoting
+>OK
+>OK
+>OK
+>OK
+
+  tmp1='glob/\(\)Ą/*'
+  [[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1"
+  tmp1='glob/\(\)Ā/*'
+  [[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1"
+0:Backslashes and metafied characters in patterns
+>Matched against glob/()Ą/*
+>Matched against glob/()Ā/*
+
+  mkdir 梶浦由記 'Пётр Ильич Чайковский'
+  (cd 梶浦由記; print ${${(%):-%~}:t})
+  (cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t})
+0:Metafied characters in prompt expansion
+>梶浦由記
+>Пётр Ильич Чайковский
+
+  (
+  setopt nonomatch
+  tmp1=Ą
+  tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記)
+  print ${tmp1} ${(%)tmp1} ${(%%)tmp1}
+  print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}}
+  print ${tmpA}
+  print ${(%)tmpA}
+  print ${(%%)tmpA}
+  )
+0:More metafied characters in prompt expansion
+>Ą Ą Ą
+>1 1 1
+>Ą Пётр Ильич Чайковский 梶浦由記
+>Ą Пётр Ильич Чайковский 梶浦由記
+>Ą Пётр Ильич Чайковский 梶浦由記
+
+  setopt cbases
+  print $'\xc5' | read
+  print $(( [#16] #REPLY ))
+0:read passes through invalid multibyte characters
+>0xC5
+
+  word=abcま
+  word[-1]=
+  print $word
+  word=abcま
+  word[-2]=
+  print $word
+  word=abcま
+  word[4]=d
+  print $word
+  word=abcま
+  word[3]=not_c
+  print $word
+0:assignment with negative indices
+>abc
+>abま
+>abcd
+>abnot_cま
+
+  # The following doesn't necessarily need UTF-8, but this gives
+  # us the full effect --- if we parse this wrongly the \xe9
+  # in combination with the tokenized input afterwards looks like a
+  # valid UTF-8 character.  But it isn't.
+  print $'$\xe9#``' >test_bad_param
+  (setopt nonomatch
+  . ./test_bad_param)
+127:Invalid parameter name with following tokenized input
+?./test_bad_param:1: command not found: $\M-i#
+
+  lines=$'one\tＺＳＨ\tthree\nfour\tfive\tsix'
+  print -X8 -r -- $lines
+0:Tab expansion with extra-wide characters
+>one     ＺＳＨ  three
+>four    five    six
+# This doesn't look aligned in my editor because actually the characters
+# aren't quite double width, but the arithmetic is correct.
+# It appears just to be an effect of the font.
+
+  () {
+     emulate -L zsh
+     setopt errreturn
+     local cdpath=(.)
+     mkdir ホ
+     cd ホ
+     cd ..
+     cd ./ホ
+     cd ..
+  }
+0:cd with special characters
+
+  test_array=(
+  '[[ \xcc = \xcc ]]'
+  '[[ \xcc != \xcd ]]'
+  '[[ \xcc != \ucc ]]'
+  '[[ \ucc = \ucc ]]'
+  '[[ \ucc = [\ucc] ]]'
+  '[[ \xcc != [\ucc] ]]'
+  # Not clear how useful the following is...
+  '[[ \xcc = [\xcc] ]]'
+  )
+  for test in $test_array; do
+    if ! eval ${(g::)test} ; then
+      print -rl "Test $test failed" >&2
+    fi
+  done
+0:Invalid characters in pattern matching
+
+  [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1
+  [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2
+  [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:INVALID:]] ]] || print fail 3
+  [[ $'\xe3\x83\x9b' = ? ]] || print fail 4
+0:Testing incomplete and invalid multibyte character components
+
+  print -r -- ${(q+):-ホ}
+  foo='She said "ホ".  I said "You can'\''t '\''ホ'\'' me!'
+  print -r -- ${(q+)foo}
+0:${(q+)...} with printable multibyte characters
+>ホ
+>'She said "ホ".  I said "You can'\''t '\''ホ'\'' me!'
+
+#  This will silently succeed if zsh/parameter isn't available
+  (zmodload zsh/parameter >/dev/null 2>&1
+  f() {
+    : $(:)
+    "↓"
+  }
+  : $functions)
+0:Multibyte handling of functions parameter
+
+# c1=U+0104 (Ą) and c2=U+0120 (Ġ) are chosen so that
+#   u1 = utf8(c1) = c4 84  <  u2 = utf8(c2) = c4 a0
+#   metafy(u1) = c4 83 a4  >  metafy(u2) = c4 83 80
+# in both UTF-8 and ASCII collations (the latter is used in macOS
+# and some versions of BSDs).
+  local -a names=( $'\u0104' $'\u0120' )
+  print -o $names
+  mkdir -p colltest
+  cd colltest
+  touch $names
+  print ?
+0:Sorting of metafied characters
+>Ą Ġ
+>Ą Ġ
+
+  printf '%q%q\n' 你你
+0:printf %q and quotestring and general metafy / token madness
+>你你
+
+# This test is kept last as it introduces an additional
+# dependency on the system regex library.
+  if zmodload zsh/regex 2>/dev/null; then
+    [[ $'\ua0' =~ '^.$' ]] && print OK
+    [[ $'\ua0' =~ $'^\ua0$' ]] && print OK
+    [[ $'\ua0'X =~ '^X$' ]] || print OK
+  else
+    ZTST_skip="regexp library not found."
+  fi
+0:Ensure no confusion on metafied input to regex module
+>OK
+>OK
+>OK
+F:A failure here may indicate the system regex library does not
+F:support character sets outside the portable 7-bit range.