diff options
Diffstat (limited to 'dotfiles/system/.zsh/modules/Test/D07multibyte.ztst')
| -rw-r--r-- | dotfiles/system/.zsh/modules/Test/D07multibyte.ztst | 587 |
1 files changed, 587 insertions, 0 deletions
diff --git a/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst new file mode 100644 index 0000000..e203153 --- /dev/null +++ b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst @@ -0,0 +1,587 @@ +%prep + +# Find a UTF-8 locale. + setopt multibyte +# Don't let LC_* override our choice of locale. + unset -m LC_\* + mb_ok= + langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8 + $(locale -a 2>/dev/null | egrep 'utf8|UTF-8')) + for LANG in $langs; do + if [[ é = ? ]]; then + mb_ok=1 + break; + fi + done + if [[ -z $mb_ok ]]; then + ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented" + else + print -u $ZTST_fd Testing multibyte with locale $LANG + mkdir multibyte.tmp && cd multibyte.tmp + fi + +%test + + a=ténébreux + for i in {1..9}; do + print ${a[i]} + for j in {$i..9}; do + print $i $j ${a[i,j]} ${a[-j,-i]} + done + done +0:Basic indexing with multibyte characters +>t +>1 1 t x +>1 2 té ux +>1 3 tén eux +>1 4 téné reux +>1 5 ténéb breux +>1 6 ténébr ébreux +>1 7 ténébre nébreux +>1 8 ténébreu énébreux +>1 9 ténébreux ténébreux +>é +>2 2 é u +>2 3 én eu +>2 4 éné reu +>2 5 énéb breu +>2 6 énébr ébreu +>2 7 énébre nébreu +>2 8 énébreu énébreu +>2 9 énébreux ténébreu +>n +>3 3 n e +>3 4 né re +>3 5 néb bre +>3 6 nébr ébre +>3 7 nébre nébre +>3 8 nébreu énébre +>3 9 nébreux ténébre +>é +>4 4 é r +>4 5 éb br +>4 6 ébr ébr +>4 7 ébre nébr +>4 8 ébreu énébr +>4 9 ébreux ténébr +>b +>5 5 b b +>5 6 br éb +>5 7 bre néb +>5 8 breu énéb +>5 9 breux ténéb +>r +>6 6 r é +>6 7 re né +>6 8 reu éné +>6 9 reux téné +>e +>7 7 e n +>7 8 eu én +>7 9 eux tén +>u +>8 8 u é +>8 9 ux té +>x +>9 9 x t + + s=é + print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E +0:Out of range subscripts with multibyte characters +>AA BéB CC DéD EE + + print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]} +0:Reverse indexing with multibyte characters +>2 4 éné + + print ${a[(r)én,(r)éb]} +0:Subscript searching with multibyte characters +>énéb + + print ${a[(rb:1:)é,-1]} + print ${a[(rb:2:)é,-1]} + print ${a[(rb:3:)é,-1]} + print ${a[(rb:4:)é,-1]} + print ${a[(rb:5:)é,-1]} +0:Subscript searching with initial offset +>énébreux +>énébreux +>ébreux +>ébreux +> + + print ${a[(rn:1:)é,-1]} + print ${a[(rn:2:)é,-1]} + print ${a[(rn:3:)é,-1]} +0:Subscript searching with count +>énébreux +>ébreux +> + + print ${a[(R)én,(R)éb]} +0:Backward subscript searching with multibyte characters +>énéb + +# Starting offsets with (R) seem to be so strange as to be hardly +# worth testing. + + setopt extendedglob + [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2 + for i in {1..${#match}}; do + print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]} + done +0:Multibyte offsets in pattern tests +>én 2 3 én +>éb 4 5 éb + + b=${(U)a} + print $b + print ${(L)b} + desdichado="Je suis le $a, le veuf, l'inconsolé" + print ${(C)desdichado} + lxiv="l'état c'est moi" + print ${(C)lxiv} +0:Case modification of multibyte strings +>TÉNÉBREUX +>ténébreux +>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé +>L'État C'Est Moi + + array=(ølaf ødd øpened án encyclopædia) + barray=(${(U)array}) + print $barray + print ${(L)barray} + print ${(C)array} + print ${(C)barray} +0:Case modification of arrays with multibyte strings +>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA +>ølaf ødd øpened án encyclopædia +>Ølaf Ødd Øpened Án Encyclopædia +>Ølaf Ødd Øpened Án Encyclopædia + + print $(( ##¥ )) + pound=£ + print $(( #pound )) + alpha=α + print $(( ##α )) $(( #alpha )) +0:Conversion to Unicode in mathematical expressions +>165 +>163 +>945 945 + + unsetopt posix_identifiers + expr='hähä=3 || exit 1; print $hähä' + eval $expr + setopt posix_identifiers + (eval $expr) +1:POSIX_IDENTIFIERS option +>3 +?(eval):1: command not found: hähä=3 + + foo="Ølaf«Ødd«øpénëd«ån«àpple" + print -l ${(s.«.)foo} + ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." + print -l ${=ioh} + print ${(w)#ioh} +0:Splitting with multibyte characters +>Ølaf +>Ødd +>øpénëd +>ån +>àpple +>Ἐν +>ἀρχῇ +>ἦν +>ὁ +>λόγος, +>καὶ +>ὁ +>λόγος +>ἦν +>πρὸς +>τὸν +>θεόν, +>καὶ +>θεὸς +>ἦν +>ὁ +>λόγος. +>17 + + read -d £ one + read -d £ two + print $one + print $two +0:read with multibyte delimiter +<first£second£ +>first +>second + + (IFS=« + read -d » -A array + print -l $array) +0:read -A with multibyte IFS +<dominus«illuminatio«mea»ignored +>dominus +>illuminatio +>mea + + read -k2 -u0 twochars + print $twochars +0:read multibyte characters +<«»ignored +>«» + + read -q -u0 mb + print $? +0:multibyte character makes read -q return false +<« +>1 + + # See if the system grokks first-century Greek... + ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." + for (( i = 1; i <= ${#ioh}; i++ )); do + # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with + # perispomeni and ypogegrammeni, of course) as a lower case character. + if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then + for tp in upper space punct invalid; do + if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then + print "$i: $tp" + break + fi + done + fi + done +0:isw* functions on non-ASCII wide characters +>1: upper +>3: space +>8: space +>11: space +>13: space +>19: punct +>20: space +>24: space +>26: space +>32: space +>35: space +>40: space +>44: space +>49: punct +>50: space +>54: space +>59: space +>62: space +>64: space +>70: punct + + ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος" + print ${ioh#[[:alpha:]]##} + print ${ioh##[[:alpha:]]##} + print ${ioh%[[:alpha:]]##} + print ${ioh%%[[:alpha:]]##} + print ${(S)ioh#λ*ς} + print ${(S)ioh##λ*ς} + print ${(S)ioh%θ*ς} + print ${(S)ioh%%θ*ς} +0:Parameter #, ##, %, %% with multibyte characters +>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος +> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ +>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος +>Ἐν ἀρχῇ ἦν ὁ +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος +>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ + + a="1ë34ë6" + print ${(BEN)a#*4} + print ${(BEN)a##*ë} + print ${(BEN)a%4*} + print ${(BEN)a%%ë*} + print ${(SBEN)a#ë3} + print ${(SBEN)a%4ë} +0:Flags B, E, N and S in ${...#...} and ${...%...} +>1 5 4 +>1 6 5 +>4 7 3 +>2 7 5 +>2 4 2 +>4 6 2 + + foo=(κατέβην χθὲς εἰς Πειραιᾶ) + print ${(l.3..¥.r.3..£.)foo} + print ${(l.4..¥.r.2..£.)foo} + print ${(l.5..¥.r.1..£.)foo} + print ${(l.4..¥..«.r.4..£..».)foo} + print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo} +0:simultaneous left and right padding +>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι +>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα +>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ +>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ +>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ +# er... yeah, that looks right... + + foo=picobarn + print ${foo:s£bar£rod£:s¥rod¥stick¥} +0:Delimiters in modifiers +>picostickn + +# TODO: if we get paired multibyte bracket delimiters to work +# (as Emacs does, the smug so-and-so), the following should change. + foo=bar + print ${(r£5££X£)foo} + print ${(l«10««Y««HI«)foo} +0:Delimiters in parameter flags +>barXX +>YYYYYHIbar + + printf "%4.3s\n" főobar +0:Multibyte characters in printf widths +> főo + +# We ask for case-insensitive sorting here (and supply upper case +# characters) so that we exercise the logic in the shell that lowers the +# case of the string for case-insensitive sorting. + print -oi HÛH HÔH HÎH HÊH HÂH + (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH) +0:Multibyte characters in print sorting +>HÂH HÊH HÎH HÔH HÛH +>HAH HEH HUH HÈH HÉH + +# These are control characters in Unicode, so don't show up. +# We just want to check they're not being treated as tokens. + for x in {128..150}; do + print ${(#)x} + done | while read line; do + print ${#line} $(( #line )) + done +0:evaluated character number with multibyte characters +>1 128 +>1 129 +>1 130 +>1 131 +>1 132 +>1 133 +>1 134 +>1 135 +>1 136 +>1 137 +>1 138 +>1 139 +>1 140 +>1 141 +>1 142 +>1 143 +>1 144 +>1 145 +>1 146 +>1 147 +>1 148 +>1 149 +>1 150 + + touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt + setopt numericglobsort + print -l ngs* +0:NUMERIC_GLOB_SORT option in UTF-8 locale +>ngs1txt +>ngs2txt +>ngs10txt +>ngs20txt +>ngs100txt +>ngs200txt + +# Not strictly multibyte, but gives us a well-defined locale for testing. + foo=$'X\xc0Y\x07Z\x7fT' + print -r ${(q)foo} +0:Backslash-quoting of unprintable/invalid characters uses $'...' +>X$'\300'Y$'\a'Z$'\177'T + +# This also isn't strictly multibyte and is here to reduce the +# likelihood of a "cannot do character set conversion" error. + (print $'\u00e9') 2>&1 | read + if [[ $REPLY != é ]]; then + print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd + print "Check you have a correctly installed iconv library." >&$ZTST_fd + # cheat + repeat 4 print OK + else + testfn() { (LC_ALL=C; print $'\u00e9') } + repeat 4 testfn 2>&1 | while read line; do + if [[ $line = *"character not in range"* ]]; then + print OK + elif [[ $line = "?" ]]; then + print OK + else + print Failed: no error message and no question mark + fi + done + fi + true +0:error handling in Unicode quoting +>OK +>OK +>OK +>OK + + tmp1='glob/\(\)Ą/*' + [[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1" + tmp1='glob/\(\)Ā/*' + [[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1" +0:Backslashes and metafied characters in patterns +>Matched against glob/()Ą/* +>Matched against glob/()Ā/* + + mkdir 梶浦由記 'Пётр Ильич Чайковский' + (cd 梶浦由記; print ${${(%):-%~}:t}) + (cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t}) +0:Metafied characters in prompt expansion +>梶浦由記 +>Пётр Ильич Чайковский + + ( + setopt nonomatch + tmp1=Ą + tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記) + print ${tmp1} ${(%)tmp1} ${(%%)tmp1} + print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}} + print ${tmpA} + print ${(%)tmpA} + print ${(%%)tmpA} + ) +0:More metafied characters in prompt expansion +>Ą Ą Ą +>1 1 1 +>Ą Пётр Ильич Чайковский 梶浦由記 +>Ą Пётр Ильич Чайковский 梶浦由記 +>Ą Пётр Ильич Чайковский 梶浦由記 + + setopt cbases + print $'\xc5' | read + print $(( [#16] #REPLY )) +0:read passes through invalid multibyte characters +>0xC5 + + word=abcま + word[-1]= + print $word + word=abcま + word[-2]= + print $word + word=abcま + word[4]=d + print $word + word=abcま + word[3]=not_c + print $word +0:assignment with negative indices +>abc +>abま +>abcd +>abnot_cま + + # The following doesn't necessarily need UTF-8, but this gives + # us the full effect --- if we parse this wrongly the \xe9 + # in combination with the tokenized input afterwards looks like a + # valid UTF-8 character. But it isn't. + print $'$\xe9#``' >test_bad_param + (setopt nonomatch + . ./test_bad_param) +127:Invalid parameter name with following tokenized input +?./test_bad_param:1: command not found: $\M-i# + + lines=$'one\tZSH\tthree\nfour\tfive\tsix' + print -X8 -r -- $lines +0:Tab expansion with extra-wide characters +>one ZSH three +>four five six +# This doesn't look aligned in my editor because actually the characters +# aren't quite double width, but the arithmetic is correct. +# It appears just to be an effect of the font. + + () { + emulate -L zsh + setopt errreturn + local cdpath=(.) + mkdir ホ + cd ホ + cd .. + cd ./ホ + cd .. + } +0:cd with special characters + + test_array=( + '[[ \xcc = \xcc ]]' + '[[ \xcc != \xcd ]]' + '[[ \xcc != \ucc ]]' + '[[ \ucc = \ucc ]]' + '[[ \ucc = [\ucc] ]]' + '[[ \xcc != [\ucc] ]]' + # Not clear how useful the following is... + '[[ \xcc = [\xcc] ]]' + ) + for test in $test_array; do + if ! eval ${(g::)test} ; then + print -rl "Test $test failed" >&2 + fi + done +0:Invalid characters in pattern matching + + [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1 + [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2 + [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:INVALID:]] ]] || print fail 3 + [[ $'\xe3\x83\x9b' = ? ]] || print fail 4 +0:Testing incomplete and invalid multibyte character components + + print -r -- ${(q+):-ホ} + foo='She said "ホ". I said "You can'\''t '\''ホ'\'' me!' + print -r -- ${(q+)foo} +0:${(q+)...} with printable multibyte characters +>ホ +>'She said "ホ". I said "You can'\''t '\''ホ'\'' me!' + +# This will silently succeed if zsh/parameter isn't available + (zmodload zsh/parameter >/dev/null 2>&1 + f() { + : $(:) + "↓" + } + : $functions) +0:Multibyte handling of functions parameter + +# c1=U+0104 (Ą) and c2=U+0120 (Ġ) are chosen so that +# u1 = utf8(c1) = c4 84 < u2 = utf8(c2) = c4 a0 +# metafy(u1) = c4 83 a4 > metafy(u2) = c4 83 80 +# in both UTF-8 and ASCII collations (the latter is used in macOS +# and some versions of BSDs). + local -a names=( $'\u0104' $'\u0120' ) + print -o $names + mkdir -p colltest + cd colltest + touch $names + print ? +0:Sorting of metafied characters +>Ą Ġ +>Ą Ġ + + printf '%q%q\n' 你你 +0:printf %q and quotestring and general metafy / token madness +>你你 + +# This test is kept last as it introduces an additional +# dependency on the system regex library. + if zmodload zsh/regex 2>/dev/null; then + [[ $'\ua0' =~ '^.$' ]] && print OK + [[ $'\ua0' =~ $'^\ua0$' ]] && print OK + [[ $'\ua0'X =~ '^X$' ]] || print OK + else + ZTST_skip="regexp library not found." + fi +0:Ensure no confusion on metafied input to regex module +>OK +>OK +>OK +F:A failure here may indicate the system regex library does not +F:support character sets outside the portable 7-bit range. |
