diff options
Diffstat (limited to 'dotfiles/system/.zsh/modules/Test/D07multibyte.ztst')
| -rw-r--r-- | dotfiles/system/.zsh/modules/Test/D07multibyte.ztst | 587 |
1 files changed, 0 insertions, 587 deletions
diff --git a/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst b/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst deleted file mode 100644 index e203153..0000000 --- a/dotfiles/system/.zsh/modules/Test/D07multibyte.ztst +++ /dev/null @@ -1,587 +0,0 @@ -%prep - -# Find a UTF-8 locale. - setopt multibyte -# Don't let LC_* override our choice of locale. - unset -m LC_\* - mb_ok= - langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8 - $(locale -a 2>/dev/null | egrep 'utf8|UTF-8')) - for LANG in $langs; do - if [[ é = ? ]]; then - mb_ok=1 - break; - fi - done - if [[ -z $mb_ok ]]; then - ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented" - else - print -u $ZTST_fd Testing multibyte with locale $LANG - mkdir multibyte.tmp && cd multibyte.tmp - fi - -%test - - a=ténébreux - for i in {1..9}; do - print ${a[i]} - for j in {$i..9}; do - print $i $j ${a[i,j]} ${a[-j,-i]} - done - done -0:Basic indexing with multibyte characters ->t ->1 1 t x ->1 2 té ux ->1 3 tén eux ->1 4 téné reux ->1 5 ténéb breux ->1 6 ténébr ébreux ->1 7 ténébre nébreux ->1 8 ténébreu énébreux ->1 9 ténébreux ténébreux ->é ->2 2 é u ->2 3 én eu ->2 4 éné reu ->2 5 énéb breu ->2 6 énébr ébreu ->2 7 énébre nébreu ->2 8 énébreu énébreu ->2 9 énébreux ténébreu ->n ->3 3 n e ->3 4 né re ->3 5 néb bre ->3 6 nébr ébre ->3 7 nébre nébre ->3 8 nébreu énébre ->3 9 nébreux ténébre ->é ->4 4 é r ->4 5 éb br ->4 6 ébr ébr ->4 7 ébre nébr ->4 8 ébreu énébr ->4 9 ébreux ténébr ->b ->5 5 b b ->5 6 br éb ->5 7 bre néb ->5 8 breu énéb ->5 9 breux ténéb ->r ->6 6 r é ->6 7 re né ->6 8 reu éné ->6 9 reux téné ->e ->7 7 e n ->7 8 eu én ->7 9 eux tén ->u ->8 8 u é ->8 9 ux té ->x ->9 9 x t - - s=é - print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E -0:Out of range subscripts with multibyte characters ->AA BéB CC DéD EE - - print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]} -0:Reverse indexing with multibyte characters ->2 4 éné - - print ${a[(r)én,(r)éb]} -0:Subscript searching with multibyte characters ->énéb - - print ${a[(rb:1:)é,-1]} - print ${a[(rb:2:)é,-1]} - print ${a[(rb:3:)é,-1]} - print ${a[(rb:4:)é,-1]} - print ${a[(rb:5:)é,-1]} -0:Subscript searching with initial offset ->énébreux ->énébreux ->ébreux ->ébreux -> - - print ${a[(rn:1:)é,-1]} - print ${a[(rn:2:)é,-1]} - print ${a[(rn:3:)é,-1]} -0:Subscript searching with count ->énébreux ->ébreux -> - - print ${a[(R)én,(R)éb]} -0:Backward subscript searching with multibyte characters ->énéb - -# Starting offsets with (R) seem to be so strange as to be hardly -# worth testing. - - setopt extendedglob - [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2 - for i in {1..${#match}}; do - print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]} - done -0:Multibyte offsets in pattern tests ->én 2 3 én ->éb 4 5 éb - - b=${(U)a} - print $b - print ${(L)b} - desdichado="Je suis le $a, le veuf, l'inconsolé" - print ${(C)desdichado} - lxiv="l'état c'est moi" - print ${(C)lxiv} -0:Case modification of multibyte strings ->TÉNÉBREUX ->ténébreux ->Je Suis Le Ténébreux, Le Veuf, L'Inconsolé ->L'État C'Est Moi - - array=(ølaf ødd øpened án encyclopædia) - barray=(${(U)array}) - print $barray - print ${(L)barray} - print ${(C)array} - print ${(C)barray} -0:Case modification of arrays with multibyte strings ->ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA ->ølaf ødd øpened án encyclopædia ->Ølaf Ødd Øpened Án Encyclopædia ->Ølaf Ødd Øpened Án Encyclopædia - - print $(( ##¥ )) - pound=£ - print $(( #pound )) - alpha=α - print $(( ##α )) $(( #alpha )) -0:Conversion to Unicode in mathematical expressions ->165 ->163 ->945 945 - - unsetopt posix_identifiers - expr='hähä=3 || exit 1; print $hähä' - eval $expr - setopt posix_identifiers - (eval $expr) -1:POSIX_IDENTIFIERS option ->3 -?(eval):1: command not found: hähä=3 - - foo="Ølaf«Ødd«øpénëd«ån«àpple" - print -l ${(s.«.)foo} - ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." - print -l ${=ioh} - print ${(w)#ioh} -0:Splitting with multibyte characters ->Ølaf ->Ødd ->øpénëd ->ån ->àpple ->Ἐν ->ἀρχῇ ->ἦν ->ὁ ->λόγος, ->καὶ ->ὁ ->λόγος ->ἦν ->πρὸς ->τὸν ->θεόν, ->καὶ ->θεὸς ->ἦν ->ὁ ->λόγος. ->17 - - read -d £ one - read -d £ two - print $one - print $two -0:read with multibyte delimiter -<first£second£ ->first ->second - - (IFS=« - read -d » -A array - print -l $array) -0:read -A with multibyte IFS -<dominus«illuminatio«mea»ignored ->dominus ->illuminatio ->mea - - read -k2 -u0 twochars - print $twochars -0:read multibyte characters -<«»ignored ->«» - - read -q -u0 mb - print $? -0:multibyte character makes read -q return false -<« ->1 - - # See if the system grokks first-century Greek... - ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." - for (( i = 1; i <= ${#ioh}; i++ )); do - # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with - # perispomeni and ypogegrammeni, of course) as a lower case character. - if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then - for tp in upper space punct invalid; do - if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then - print "$i: $tp" - break - fi - done - fi - done -0:isw* functions on non-ASCII wide characters ->1: upper ->3: space ->8: space ->11: space ->13: space ->19: punct ->20: space ->24: space ->26: space ->32: space ->35: space ->40: space ->44: space ->49: punct ->50: space ->54: space ->59: space ->62: space ->64: space ->70: punct - - ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος" - print ${ioh#[[:alpha:]]##} - print ${ioh##[[:alpha:]]##} - print ${ioh%[[:alpha:]]##} - print ${ioh%%[[:alpha:]]##} - print ${(S)ioh#λ*ς} - print ${(S)ioh##λ*ς} - print ${(S)ioh%θ*ς} - print ${(S)ioh%%θ*ς} -0:Parameter #, ##, %, %% with multibyte characters ->ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος -> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος ->Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο ->Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ ->Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος ->Ἐν ἀρχῇ ἦν ὁ ->Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος ->Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ - - a="1ë34ë6" - print ${(BEN)a#*4} - print ${(BEN)a##*ë} - print ${(BEN)a%4*} - print ${(BEN)a%%ë*} - print ${(SBEN)a#ë3} - print ${(SBEN)a%4ë} -0:Flags B, E, N and S in ${...#...} and ${...%...} ->1 5 4 ->1 6 5 ->4 7 3 ->2 7 5 ->2 4 2 ->4 6 2 - - foo=(κατέβην χθὲς εἰς Πειραιᾶ) - print ${(l.3..¥.r.3..£.)foo} - print ${(l.4..¥.r.2..£.)foo} - print ${(l.5..¥.r.1..£.)foo} - print ${(l.4..¥..«.r.4..£..».)foo} - print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo} -0:simultaneous left and right padding ->κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι ->¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα ->¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ ->«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ ->ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ -# er... yeah, that looks right... - - foo=picobarn - print ${foo:s£bar£rod£:s¥rod¥stick¥} -0:Delimiters in modifiers ->picostickn - -# TODO: if we get paired multibyte bracket delimiters to work -# (as Emacs does, the smug so-and-so), the following should change. - foo=bar - print ${(r£5££X£)foo} - print ${(l«10««Y««HI«)foo} -0:Delimiters in parameter flags ->barXX ->YYYYYHIbar - - printf "%4.3s\n" főobar -0:Multibyte characters in printf widths -> főo - -# We ask for case-insensitive sorting here (and supply upper case -# characters) so that we exercise the logic in the shell that lowers the -# case of the string for case-insensitive sorting. - print -oi HÛH HÔH HÎH HÊH HÂH - (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH) -0:Multibyte characters in print sorting ->HÂH HÊH HÎH HÔH HÛH ->HAH HEH HUH HÈH HÉH - -# These are control characters in Unicode, so don't show up. -# We just want to check they're not being treated as tokens. - for x in {128..150}; do - print ${(#)x} - done | while read line; do - print ${#line} $(( #line )) - done -0:evaluated character number with multibyte characters ->1 128 ->1 129 ->1 130 ->1 131 ->1 132 ->1 133 ->1 134 ->1 135 ->1 136 ->1 137 ->1 138 ->1 139 ->1 140 ->1 141 ->1 142 ->1 143 ->1 144 ->1 145 ->1 146 ->1 147 ->1 148 ->1 149 ->1 150 - - touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt - setopt numericglobsort - print -l ngs* -0:NUMERIC_GLOB_SORT option in UTF-8 locale ->ngs1txt ->ngs2txt ->ngs10txt ->ngs20txt ->ngs100txt ->ngs200txt - -# Not strictly multibyte, but gives us a well-defined locale for testing. - foo=$'X\xc0Y\x07Z\x7fT' - print -r ${(q)foo} -0:Backslash-quoting of unprintable/invalid characters uses $'...' ->X$'\300'Y$'\a'Z$'\177'T - -# This also isn't strictly multibyte and is here to reduce the -# likelihood of a "cannot do character set conversion" error. - (print $'\u00e9') 2>&1 | read - if [[ $REPLY != é ]]; then - print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd - print "Check you have a correctly installed iconv library." >&$ZTST_fd - # cheat - repeat 4 print OK - else - testfn() { (LC_ALL=C; print $'\u00e9') } - repeat 4 testfn 2>&1 | while read line; do - if [[ $line = *"character not in range"* ]]; then - print OK - elif [[ $line = "?" ]]; then - print OK - else - print Failed: no error message and no question mark - fi - done - fi - true -0:error handling in Unicode quoting ->OK ->OK ->OK ->OK - - tmp1='glob/\(\)Ą/*' - [[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1" - tmp1='glob/\(\)Ā/*' - [[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1" -0:Backslashes and metafied characters in patterns ->Matched against glob/()Ą/* ->Matched against glob/()Ā/* - - mkdir 梶浦由記 'Пётр Ильич Чайковский' - (cd 梶浦由記; print ${${(%):-%~}:t}) - (cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t}) -0:Metafied characters in prompt expansion ->梶浦由記 ->Пётр Ильич Чайковский - - ( - setopt nonomatch - tmp1=Ą - tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記) - print ${tmp1} ${(%)tmp1} ${(%%)tmp1} - print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}} - print ${tmpA} - print ${(%)tmpA} - print ${(%%)tmpA} - ) -0:More metafied characters in prompt expansion ->Ą Ą Ą ->1 1 1 ->Ą Пётр Ильич Чайковский 梶浦由記 ->Ą Пётр Ильич Чайковский 梶浦由記 ->Ą Пётр Ильич Чайковский 梶浦由記 - - setopt cbases - print $'\xc5' | read - print $(( [#16] #REPLY )) -0:read passes through invalid multibyte characters ->0xC5 - - word=abcま - word[-1]= - print $word - word=abcま - word[-2]= - print $word - word=abcま - word[4]=d - print $word - word=abcま - word[3]=not_c - print $word -0:assignment with negative indices ->abc ->abま ->abcd ->abnot_cま - - # The following doesn't necessarily need UTF-8, but this gives - # us the full effect --- if we parse this wrongly the \xe9 - # in combination with the tokenized input afterwards looks like a - # valid UTF-8 character. But it isn't. - print $'$\xe9#``' >test_bad_param - (setopt nonomatch - . ./test_bad_param) -127:Invalid parameter name with following tokenized input -?./test_bad_param:1: command not found: $\M-i# - - lines=$'one\tZSH\tthree\nfour\tfive\tsix' - print -X8 -r -- $lines -0:Tab expansion with extra-wide characters ->one ZSH three ->four five six -# This doesn't look aligned in my editor because actually the characters -# aren't quite double width, but the arithmetic is correct. -# It appears just to be an effect of the font. - - () { - emulate -L zsh - setopt errreturn - local cdpath=(.) - mkdir ホ - cd ホ - cd .. - cd ./ホ - cd .. - } -0:cd with special characters - - test_array=( - '[[ \xcc = \xcc ]]' - '[[ \xcc != \xcd ]]' - '[[ \xcc != \ucc ]]' - '[[ \ucc = \ucc ]]' - '[[ \ucc = [\ucc] ]]' - '[[ \xcc != [\ucc] ]]' - # Not clear how useful the following is... - '[[ \xcc = [\xcc] ]]' - ) - for test in $test_array; do - if ! eval ${(g::)test} ; then - print -rl "Test $test failed" >&2 - fi - done -0:Invalid characters in pattern matching - - [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1 - [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2 - [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:INVALID:]] ]] || print fail 3 - [[ $'\xe3\x83\x9b' = ? ]] || print fail 4 -0:Testing incomplete and invalid multibyte character components - - print -r -- ${(q+):-ホ} - foo='She said "ホ". I said "You can'\''t '\''ホ'\'' me!' - print -r -- ${(q+)foo} -0:${(q+)...} with printable multibyte characters ->ホ ->'She said "ホ". I said "You can'\''t '\''ホ'\'' me!' - -# This will silently succeed if zsh/parameter isn't available - (zmodload zsh/parameter >/dev/null 2>&1 - f() { - : $(:) - "↓" - } - : $functions) -0:Multibyte handling of functions parameter - -# c1=U+0104 (Ą) and c2=U+0120 (Ġ) are chosen so that -# u1 = utf8(c1) = c4 84 < u2 = utf8(c2) = c4 a0 -# metafy(u1) = c4 83 a4 > metafy(u2) = c4 83 80 -# in both UTF-8 and ASCII collations (the latter is used in macOS -# and some versions of BSDs). - local -a names=( $'\u0104' $'\u0120' ) - print -o $names - mkdir -p colltest - cd colltest - touch $names - print ? -0:Sorting of metafied characters ->Ą Ġ ->Ą Ġ - - printf '%q%q\n' 你你 -0:printf %q and quotestring and general metafy / token madness ->你你 - -# This test is kept last as it introduces an additional -# dependency on the system regex library. - if zmodload zsh/regex 2>/dev/null; then - [[ $'\ua0' =~ '^.$' ]] && print OK - [[ $'\ua0' =~ $'^\ua0$' ]] && print OK - [[ $'\ua0'X =~ '^X$' ]] || print OK - else - ZTST_skip="regexp library not found." - fi -0:Ensure no confusion on metafied input to regex module ->OK ->OK ->OK -F:A failure here may indicate the system regex library does not -F:support character sets outside the portable 7-bit range. |
