diff options
Diffstat (limited to 'dotfiles/system/.zsh/modules/Test/D07multibyte.ztst')
| -rw-r--r-- | dotfiles/system/.zsh/modules/Test/D07multibyte.ztst | 587 | 
1 files changed, 587 insertions, 0 deletions
| diff --git a/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst new file mode 100644 index 0000000..e203153 --- /dev/null +++ b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst @@ -0,0 +1,587 @@ +%prep + +# Find a UTF-8 locale. +  setopt multibyte +# Don't let LC_* override our choice of locale. +  unset -m LC_\* +  mb_ok= +  langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8 +	 $(locale -a 2>/dev/null | egrep 'utf8|UTF-8')) +  for LANG in $langs; do +    if [[ é = ? ]]; then +      mb_ok=1 +      break; +    fi +  done +  if [[ -z $mb_ok ]]; then +    ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented" +  else +    print -u $ZTST_fd Testing multibyte with locale $LANG +    mkdir multibyte.tmp && cd multibyte.tmp +  fi + +%test + +  a=ténébreux +  for i in {1..9}; do +      print ${a[i]} +      for j in {$i..9}; do +	  print $i $j ${a[i,j]} ${a[-j,-i]} +      done +  done +0:Basic indexing with multibyte characters +>t +>1 1 t x +>1 2 té ux +>1 3 tén eux +>1 4 téné reux +>1 5 ténéb breux +>1 6 ténébr ébreux +>1 7 ténébre nébreux +>1 8 ténébreu énébreux +>1 9 ténébreux ténébreux +>é +>2 2 é u +>2 3 én eu +>2 4 éné reu +>2 5 énéb breu +>2 6 énébr ébreu +>2 7 énébre nébreu +>2 8 énébreu énébreu +>2 9 énébreux ténébreu +>n +>3 3 n e +>3 4 né re +>3 5 néb bre +>3 6 nébr ébre +>3 7 nébre nébre +>3 8 nébreu énébre +>3 9 nébreux ténébre +>é +>4 4 é r +>4 5 éb br +>4 6 ébr ébr +>4 7 ébre nébr +>4 8 ébreu énébr +>4 9 ébreux ténébr +>b +>5 5 b b +>5 6 br éb +>5 7 bre néb +>5 8 breu énéb +>5 9 breux ténéb +>r +>6 6 r é +>6 7 re né +>6 8 reu éné +>6 9 reux téné +>e +>7 7 e n +>7 8 eu én +>7 9 eux tén +>u +>8 8 u é +>8 9 ux té +>x +>9 9 x t + +  s=é +  print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E +0:Out of range subscripts with multibyte characters +>AA BéB CC DéD EE + +  print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]} +0:Reverse indexing with multibyte characters +>2 4 éné + +  print ${a[(r)én,(r)éb]} +0:Subscript searching with multibyte characters +>énéb + +  print ${a[(rb:1:)é,-1]} +  print ${a[(rb:2:)é,-1]} +  print ${a[(rb:3:)é,-1]} +  print ${a[(rb:4:)é,-1]} +  print ${a[(rb:5:)é,-1]} +0:Subscript searching with initial offset +>énébreux +>énébreux +>ébreux +>ébreux +> + +  print ${a[(rn:1:)é,-1]} +  print ${a[(rn:2:)é,-1]} +  print ${a[(rn:3:)é,-1]} +0:Subscript searching with count +>énébreux +>ébreux +> + +  print ${a[(R)én,(R)éb]} +0:Backward subscript searching with multibyte characters +>énéb + +# Starting offsets with (R) seem to be so strange as to be hardly +# worth testing. + +  setopt extendedglob +  [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2 +  for i in {1..${#match}}; do +    print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]} +  done +0:Multibyte offsets in pattern tests +>én 2 3 én +>éb 4 5 éb + +  b=${(U)a} +  print $b +  print ${(L)b} +  desdichado="Je suis le $a, le veuf, l'inconsolé" +  print ${(C)desdichado} +  lxiv="l'état c'est moi" +  print ${(C)lxiv} +0:Case modification of multibyte strings +>TÉNÉBREUX +>ténébreux +>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé +>L'État C'Est Moi + +  array=(ølaf ødd øpened án encyclopædia) +  barray=(${(U)array}) +  print $barray +  print ${(L)barray} +  print ${(C)array} +  print ${(C)barray} +0:Case modification of arrays with multibyte strings +>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA +>ølaf ødd øpened án encyclopædia +>Ølaf Ødd Øpened Án Encyclopædia +>Ølaf Ødd Øpened Án Encyclopædia + +  print $(( ##¥ )) +  pound=£ +  print $(( #pound )) +  alpha=α +  print $(( ##α )) $(( #alpha )) +0:Conversion to Unicode in mathematical expressions +>165 +>163 +>945 945 + +  unsetopt posix_identifiers +  expr='hähä=3 || exit 1; print $hähä' +  eval $expr +  setopt posix_identifiers +  (eval $expr) +1:POSIX_IDENTIFIERS option +>3 +?(eval):1: command not found: hähä=3 + +  foo="Ølaf«Ødd«øpénëd«ån«àpple" +  print -l ${(s.«.)foo} +  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." +  print -l ${=ioh} +  print ${(w)#ioh} +0:Splitting with multibyte characters +>Ølaf +>Ødd +>øpénëd +>ån +>àpple +>Ἐν +>ἀρχῇ +>ἦν +>ὁ +>λόγος, +>καὶ +>ὁ +>λόγος +>ἦν +>πρὸς +>τὸν +>θεόν, +>καὶ +>θεὸς +>ἦν +>ὁ +>λόγος. +>17 + +  read -d £ one +  read -d £ two +  print $one +  print $two +0:read with multibyte delimiter +<first£second£ +>first +>second + +  (IFS=« +  read -d » -A array +  print -l $array) +0:read -A with multibyte IFS +<dominus«illuminatio«mea»ignored +>dominus +>illuminatio +>mea + +  read -k2 -u0 twochars +  print $twochars +0:read multibyte characters +<«»ignored +>«» + +  read -q -u0 mb +  print $? +0:multibyte character makes read -q return false +<« +>1 + +  # See if the system grokks first-century Greek... +  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." +  for (( i = 1; i <= ${#ioh}; i++ )); do +    # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with +    # perispomeni and ypogegrammeni, of course) as a lower case character. +    if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then +      for tp in upper space punct invalid; do +        if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then +          print "$i: $tp" +	  break +	fi +      done +    fi +  done +0:isw* functions on non-ASCII wide characters +>1: upper +>3: space +>8: space +>11: space +>13: space +>19: punct +>20: space +>24: space +>26: space +>32: space +>35: space +>40: space +>44: space +>49: punct +>50: space +>54: space +>59: space +>62: space +>64: space +>70: punct + +  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος" +  print ${ioh#[[:alpha:]]##} +  print ${ioh##[[:alpha:]]##} +  print ${ioh%[[:alpha:]]##} +  print ${ioh%%[[:alpha:]]##} +  print ${(S)ioh#λ*ς} +  print ${(S)ioh##λ*ς} +  print ${(S)ioh%θ*ς} +  print ${(S)ioh%%θ*ς} +0:Parameter #, ##, %, %% with multibyte characters +>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος +> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ  +>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος +>Ἐν ἀρχῇ ἦν ὁ  +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ  ἦν ὁ λόγος +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ  + +  a="1ë34ë6" +  print ${(BEN)a#*4} +  print ${(BEN)a##*ë} +  print ${(BEN)a%4*} +  print ${(BEN)a%%ë*} +  print ${(SBEN)a#ë3} +  print ${(SBEN)a%4ë} +0:Flags B, E, N and S in ${...#...} and ${...%...} +>1 5 4 +>1 6 5 +>4 7 3 +>2 7 5 +>2 4 2 +>4 6 2 + +  foo=(κατέβην χθὲς εἰς Πειραιᾶ) +  print ${(l.3..¥.r.3..£.)foo} +  print ${(l.4..¥.r.2..£.)foo} +  print ${(l.5..¥.r.1..£.)foo} +  print ${(l.4..¥..«.r.4..£..».)foo} +  print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo} +0:simultaneous left and right padding +>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι +>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα +>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ +>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ +>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ +# er... yeah, that looks right... + +  foo=picobarn +  print ${foo:s£bar£rod£:s¥rod¥stick¥} +0:Delimiters in modifiers +>picostickn + +# TODO: if we get paired multibyte bracket delimiters to work +# (as Emacs does, the smug so-and-so), the following should change. +  foo=bar +  print ${(r£5££X£)foo} +  print ${(l«10««Y««HI«)foo} +0:Delimiters in parameter flags +>barXX +>YYYYYHIbar + +  printf "%4.3s\n" főobar +0:Multibyte characters in printf widths +> főo + +# We ask for case-insensitive sorting here (and supply upper case +# characters) so that we exercise the logic in the shell that lowers the +# case of the string for case-insensitive sorting. +  print -oi HÛH HÔH HÎH HÊH HÂH +  (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH) +0:Multibyte characters in print sorting +>HÂH HÊH HÎH HÔH HÛH +>HAH HEH HUH HÈH HÉH + +# These are control characters in Unicode, so don't show up. +# We just want to check they're not being treated as tokens. +  for x in {128..150}; do +     print ${(#)x} +  done | while read line; do +    print ${#line} $(( #line )) +  done +0:evaluated character number with multibyte characters +>1 128 +>1 129 +>1 130 +>1 131 +>1 132 +>1 133 +>1 134 +>1 135 +>1 136 +>1 137 +>1 138 +>1 139 +>1 140 +>1 141 +>1 142 +>1 143 +>1 144 +>1 145 +>1 146 +>1 147 +>1 148 +>1 149 +>1 150 + +  touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt +  setopt numericglobsort +  print -l ngs* +0:NUMERIC_GLOB_SORT option in UTF-8 locale +>ngs1txt +>ngs2txt +>ngs10txt +>ngs20txt +>ngs100txt +>ngs200txt + +# Not strictly multibyte, but gives us a well-defined locale for testing. +  foo=$'X\xc0Y\x07Z\x7fT' +  print -r ${(q)foo} +0:Backslash-quoting of unprintable/invalid characters uses $'...' +>X$'\300'Y$'\a'Z$'\177'T + +# This also isn't strictly multibyte and is here to reduce the +# likelihood of a "cannot do character set conversion" error. +  (print $'\u00e9') 2>&1 | read +  if [[ $REPLY != é ]]; then +    print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd +    print "Check you have a correctly installed iconv library." >&$ZTST_fd +    # cheat +    repeat 4 print OK +  else +    testfn() { (LC_ALL=C; print $'\u00e9') } +    repeat 4 testfn 2>&1 | while read line; do +      if [[ $line = *"character not in range"* ]]; then +        print OK +      elif [[ $line = "?" ]]; then +        print OK +      else +        print Failed: no error message and no question mark +      fi +    done +  fi +  true +0:error handling in Unicode quoting +>OK +>OK +>OK +>OK + +  tmp1='glob/\(\)Ą/*' +  [[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1" +  tmp1='glob/\(\)Ā/*' +  [[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1" +0:Backslashes and metafied characters in patterns +>Matched against glob/()Ą/* +>Matched against glob/()Ā/* + +  mkdir 梶浦由記 'Пётр Ильич Чайковский' +  (cd 梶浦由記; print ${${(%):-%~}:t}) +  (cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t}) +0:Metafied characters in prompt expansion +>梶浦由記 +>Пётр Ильич Чайковский + +  ( +  setopt nonomatch +  tmp1=Ą +  tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記) +  print ${tmp1} ${(%)tmp1} ${(%%)tmp1} +  print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}} +  print ${tmpA} +  print ${(%)tmpA} +  print ${(%%)tmpA} +  ) +0:More metafied characters in prompt expansion +>Ą Ą Ą +>1 1 1 +>Ą Пётр Ильич Чайковский 梶浦由記 +>Ą Пётр Ильич Чайковский 梶浦由記 +>Ą Пётр Ильич Чайковский 梶浦由記 + +  setopt cbases +  print $'\xc5' | read +  print $(( [#16] #REPLY )) +0:read passes through invalid multibyte characters +>0xC5 + +  word=abcま +  word[-1]= +  print $word +  word=abcま +  word[-2]= +  print $word +  word=abcま +  word[4]=d +  print $word +  word=abcま +  word[3]=not_c +  print $word +0:assignment with negative indices +>abc +>abま +>abcd +>abnot_cま + +  # The following doesn't necessarily need UTF-8, but this gives +  # us the full effect --- if we parse this wrongly the \xe9 +  # in combination with the tokenized input afterwards looks like a +  # valid UTF-8 character.  But it isn't. +  print $'$\xe9#``' >test_bad_param +  (setopt nonomatch +  . ./test_bad_param) +127:Invalid parameter name with following tokenized input +?./test_bad_param:1: command not found: $\M-i# + +  lines=$'one\tZSH\tthree\nfour\tfive\tsix' +  print -X8 -r -- $lines +0:Tab expansion with extra-wide characters +>one     ZSH  three +>four    five    six +# This doesn't look aligned in my editor because actually the characters +# aren't quite double width, but the arithmetic is correct. +# It appears just to be an effect of the font. + +  () { +     emulate -L zsh +     setopt errreturn +     local cdpath=(.) +     mkdir ホ +     cd ホ +     cd .. +     cd ./ホ +     cd .. +  } +0:cd with special characters + +  test_array=( +  '[[ \xcc = \xcc ]]' +  '[[ \xcc != \xcd ]]' +  '[[ \xcc != \ucc ]]' +  '[[ \ucc = \ucc ]]' +  '[[ \ucc = [\ucc] ]]' +  '[[ \xcc != [\ucc] ]]' +  # Not clear how useful the following is... +  '[[ \xcc = [\xcc] ]]' +  ) +  for test in $test_array; do +    if ! eval ${(g::)test} ; then +      print -rl "Test $test failed" >&2 +    fi +  done +0:Invalid characters in pattern matching + +  [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1 +  [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2 +  [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:INVALID:]] ]] || print fail 3 +  [[ $'\xe3\x83\x9b' = ? ]] || print fail 4 +0:Testing incomplete and invalid multibyte character components + +  print -r -- ${(q+):-ホ} +  foo='She said "ホ".  I said "You can'\''t '\''ホ'\'' me!' +  print -r -- ${(q+)foo} +0:${(q+)...} with printable multibyte characters +>ホ +>'She said "ホ".  I said "You can'\''t '\''ホ'\'' me!' + +#  This will silently succeed if zsh/parameter isn't available +  (zmodload zsh/parameter >/dev/null 2>&1 +  f() { +    : $(:) +    "↓" +  } +  : $functions) +0:Multibyte handling of functions parameter + +# c1=U+0104 (Ą) and c2=U+0120 (Ġ) are chosen so that +#   u1 = utf8(c1) = c4 84  <  u2 = utf8(c2) = c4 a0 +#   metafy(u1) = c4 83 a4  >  metafy(u2) = c4 83 80 +# in both UTF-8 and ASCII collations (the latter is used in macOS +# and some versions of BSDs). +  local -a names=( $'\u0104' $'\u0120' ) +  print -o $names +  mkdir -p colltest +  cd colltest +  touch $names +  print ? +0:Sorting of metafied characters +>Ą Ġ +>Ą Ġ + +  printf '%q%q\n' 你你 +0:printf %q and quotestring and general metafy / token madness +>你你 + +# This test is kept last as it introduces an additional +# dependency on the system regex library. +  if zmodload zsh/regex 2>/dev/null; then +    [[ $'\ua0' =~ '^.$' ]] && print OK +    [[ $'\ua0' =~ $'^\ua0$' ]] && print OK +    [[ $'\ua0'X =~ '^X$' ]] || print OK +  else +    ZTST_skip="regexp library not found." +  fi +0:Ensure no confusion on metafied input to regex module +>OK +>OK +>OK +F:A failure here may indicate the system regex library does not +F:support character sets outside the portable 7-bit range. | 
