This is a sparse Unicode table. Codepoints without entries are assumed to have the value: [0, 0, nil, nil, nil, nil, nil]
This module is loosely based on idn_actionmailer by Mick Staugaard, the unicode library by Yoshida Masato, and the punycode implementation by Kazuhiro Nishiyama. Most of the code was copied verbatim, but some reformatting was done, and some translation from C was done.
Without their code to work from as a base, we'd all still be relying on the presence of libidn, which nobody ever seems to have installed.
Original sources: github.com/staugaard/idn_actionmailer, www.yoshidam.net/Ruby.html#unicode, and rubyforge.org/frs/?group_id=2550
# File lib/addressable/idna/native.rb, line 34
# Passes +value+ straight to IDN::Idna.toASCII and returns that call's result.
#
# NOTE(review): presumably this converts an internationalized host name to
# its ASCII-compatible form via the libidn bindings -- confirm against the
# idn gem documentation.
#
# @param value The value handed to IDN::Idna.toASCII unchanged.
# @return The result of IDN::Idna.toASCII.
def self.to_ascii(value)
  IDN::Idna.toASCII(value)
end
# File lib/addressable/idna/native.rb, line 38
# Passes +value+ straight to IDN::Idna.toUnicode and returns that call's
# result.
#
# NOTE(review): presumably the inverse of to_ascii (ASCII-compatible form
# back to Unicode) -- confirm against the idn gem documentation.
#
# @param value The value handed to IDN::Idna.toUnicode unchanged.
# @return The result of IDN::Idna.toUnicode.
def self.to_unicode(value)
  IDN::Idna.toUnicode(value)
end
# File lib/addressable/idna/native.rb, line 30
# Passes +value+ straight to IDN::Stringprep.nfkc_normalize and returns that
# call's result.
#
# NOTE(review): the callee's name suggests Unicode NFKC normalization --
# confirm against the idn gem documentation.
#
# @param value The value handed to IDN::Stringprep.nfkc_normalize unchanged.
# @return The result of IDN::Stringprep.nfkc_normalize.
def self.unicode_normalize_kc(value)
  IDN::Stringprep.nfkc_normalize(value)
end
# File lib/addressable/idna/pure.rb, line 273
# Reads the combining-class field of a codepoint's UNICODE_DATA entry.
#
# @param codepoint The key used to index UNICODE_DATA.
# @return The stored combining class, or 0 when the codepoint has no entry
#   or the entry's combining-class field is empty.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
# File lib/addressable/idna/pure.rb, line 281
# Reads the compatibility field of a codepoint's UNICODE_DATA entry.
#
# @param codepoint The key used to index UNICODE_DATA.
# @return The stored compatibility value, or nil when the codepoint has no
#   entry in the table.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
# File lib/addressable/idna/pure.rb, line 296
# Looks +unpacked+ up in COMPOSITION_TABLE.
#
# @param unpacked The key to look up (unicode_compose_pair passes an array
#   of UTF-8 byte values).
# @return Whatever COMPOSITION_TABLE holds for that key.
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
# File lib/addressable/idna/pure.rb, line 288
# Reads the lowercase field of a codepoint's UNICODE_DATA entry.
#
# @param codepoint The key used to index UNICODE_DATA.
# @return The stored lowercase value, or +codepoint+ itself when the
#   codepoint has no entry or the entry's lowercase field is empty.
def self.lookup_unicode_lowercase(codepoint)
  entry = UNICODE_DATA[codepoint]
  return codepoint unless entry

  entry[UNICODE_DATA_LOWERCASE] || codepoint
end
Bias adaptation method
# File lib/addressable/idna/pure.rb, line 640
# Punycode bias adaptation: derives the next bias from the most recent delta.
#
# @param delta The delta to adapt from.
# @param numpoints The divisor used for the proportional increase of delta.
# @param firsttime When truthy, delta is divided by PUNYCODE_DAMP instead of
#   being halved.
# @return The adapted bias.
def self.punycode_adapt(delta, numpoints, firsttime)
  scaled =
    if firsttime
      delta / PUNYCODE_DAMP
    else
      # Shifting right by one bit halves the value.
      delta >> 1
    end
  scaled += scaled / numpoints

  span = PUNYCODE_BASE - PUNYCODE_TMIN
  threshold = (span * PUNYCODE_TMAX) / 2

  offset = 0
  while scaled > threshold
    scaled /= span
    offset += PUNYCODE_BASE
  end

  offset + (span + 1) * scaled / (scaled + PUNYCODE_SKEW)
end
# File lib/addressable/idna/pure.rb, line 608
# Tells whether a codepoint is below 128 (0x80).
#
# @param codepoint The codepoint to test.
# @return [Boolean] true when +codepoint+ is less than 128.
def self.punycode_basic?(codepoint)
  codepoint < 128
end
# File lib/addressable/idna/native.rb, line 26
# Passes +value+ straight to IDN::Punycode.decode and returns that call's
# result.
#
# @param value The value handed to IDN::Punycode.decode unchanged.
# @return The result of IDN::Punycode.decode.
def self.punycode_decode(value)
  IDN::Punycode.decode(value)
end
Returns the numeric value of a basic codepoint (for use in representing integers) in the range 0 to base - 1, or PUNYCODE_BASE if codepoint does not represent a value.
# File lib/addressable/idna/pure.rb, line 626
# Maps a basic codepoint to the digit value it represents.
#
# The ranges are tested explicitly. The previous form (`codepoint - 48 < 10`
# and friends) only works with unsigned arithmetic; with Ruby's signed
# integers it accepted every codepoint below '0' as a digit and produced
# negative values for the gaps between the digit and letter ranges.
#
# @param [Integer] codepoint The codepoint to decode.
# @return [Integer] 26..35 for '0'..'9', 0..25 for 'A'..'Z' or 'a'..'z',
#   and PUNYCODE_BASE for any other codepoint.
def self.punycode_decode_digit(codepoint)
  case codepoint
  when 48..57  then codepoint - 22 # '0'..'9' => 26..35
  when 65..90  then codepoint - 65 # 'A'..'Z' => 0..25
  when 97..122 then codepoint - 97 # 'a'..'z' => 0..25
  else PUNYCODE_BASE
  end
end
# File lib/addressable/idna/pure.rb, line 613
# Tells whether a codepoint equals PUNYCODE_DELIMITER.
#
# @param codepoint The codepoint to test.
# @return [Boolean] true when +codepoint+ == PUNYCODE_DELIMITER.
def self.punycode_delimiter?(codepoint)
  codepoint == PUNYCODE_DELIMITER
end
# File lib/addressable/idna/native.rb, line 22
# Passes +value+ straight to IDN::Punycode.encode and returns that call's
# result.
#
# @param value The value handed to IDN::Punycode.encode unchanged.
# @return The result of IDN::Punycode.encode.
def self.punycode_encode(value)
  IDN::Punycode.encode(value)
end
# File lib/addressable/idna/pure.rb, line 618
# Maps a digit value to the codepoint that represents it.
#
# Values below 26 land on 97 + d (0 => 'a', 25 => 'z'); values from 26 up
# land on 22 + d (26 => '0', 35 => '9').
#
# @param [Integer] d The digit value.
# @return [Integer] The codepoint for that digit.
def self.punycode_encode_digit(d)
  letter_shift = d < 26 ? 75 : 0
  d + 22 + letter_shift
end
# File lib/addressable/idna/pure.rb, line 128
# Composes adjacent codepoints into precomposed codepoints wherever
# unicode_compose_pair reports a composite.
#
# The method walks the input left to right, holding a pending "starter". A
# following codepoint is merged into the starter only while the starter's
# combining class is 0; otherwise the starter is emitted and the following
# codepoint becomes the new starter.
#
# The combining class of the pending starter is refreshed on every step.
# Previously the loop assigned a misspelled local (`startercc`), so the
# tracked class kept its initial value; an input beginning with a combining
# mark therefore never composed anything for the rest of the sequence.
#
# NOTE(review): only adjacent pairs are considered; a mark separated from
# its base by another non-composing mark is not combined. Confirm whether
# callers rely on full canonical composition.
#
# @param [Array<Integer>] unpacked The codepoints to compose.
# @return [Array<Integer>] The composed codepoints. An empty input is
#   returned as-is (same object).
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  starter = unpacked[0]
  starter_cc = lookup_unicode_combining_class(starter)
  # Any non-zero value means "not a starter"; 256 is the marker value the
  # original code used for a leading combining mark.
  starter_cc = 256 if starter_cc != 0

  unpacked.drop(1).each do |ch|
    cc = lookup_unicode_combining_class(ch)
    composite = starter_cc == 0 ? unicode_compose_pair(starter, ch) : nil
    if composite
      starter = composite
      starter_cc = lookup_unicode_combining_class(composite)
    else
      composed << starter
      starter = ch
      starter_cc = cc
    end
  end

  composed << starter
end
# File lib/addressable/idna/pure.rb, line 156
# Tries to combine two adjacent codepoints into a single codepoint.
#
# Hangul is handled arithmetically from the HANGUL_* constants:
# * a leading consonant (L) followed by a vowel (V) yields an LV syllable;
# * an LV syllable followed by a trailing consonant (T) yields an LVT
#   syllable.
# Every other pair is resolved through lookup_unicode_composition, keyed by
# the UTF-8 bytes of both codepoints concatenated.
#
# The trailing-consonant test is strictly greater than HANGUL_TBASE: the
# LVT result is `ch_one + (ch_two - HANGUL_TBASE)`, so accepting
# ch_two == HANGUL_TBASE would return the LV syllable unchanged and silently
# drop the second codepoint.
#
# @param [Integer] ch_one The first codepoint of the pair.
# @param [Integer] ch_two The codepoint that follows it.
# @return The composed codepoint for a Hangul pair, otherwise whatever
#   lookup_unicode_composition returns for the pair's UTF-8 bytes.
def self.unicode_compose_pair(ch_one, ch_two)
  if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
     ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
    # Hangul L + V
    l_index = ch_one - HANGUL_LBASE
    v_index = ch_two - HANGUL_VBASE
    return HANGUL_SBASE + (l_index * HANGUL_VCOUNT + v_index) * HANGUL_TCOUNT
  elsif ch_one >= HANGUL_SBASE && ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
        (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
        ch_two > HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
    # Hangul LV + T
    return ch_one + (ch_two - HANGUL_TBASE)
  end

  # pack("U*") emits the same UTF-8 byte sequences (1 to 6 bytes, for
  # codepoints 0..0x7FFFFFFF) that the former hand-written encoder built.
  utf8_bytes = [ch_one, ch_two].pack("U*").unpack("C*")
  lookup_unicode_composition(utf8_bytes)
end
# File lib/addressable/idna/pure.rb, line 235
# Expands every codepoint of +unpacked+ into its decomposed form.
#
# Codepoints inside the Hangul syllable range are split through
# unicode_decompose_hangul. Any other codepoint with a compatibility mapping
# (per lookup_unicode_compatibility) is replaced by the recursive
# decomposition of that mapping's codepoints. Everything else is copied
# through unchanged.
#
# @param unpacked The codepoints to decompose.
# @return [Array] A new array holding the decomposed codepoints.
def self.unicode_decompose(unpacked)
  decomposed = []
  unpacked.each do |codepoint|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      decomposed << lead
      decomposed << vowel if vowel
      decomposed << trail if trail
      next
    end

    mapping = lookup_unicode_compatibility(codepoint)
    if mapping
      # The mapping is a packed string; expand it and decompose recursively.
      decomposed.concat(unicode_decompose(mapping.unpack("U*")))
    else
      decomposed << codepoint
    end
  end
  decomposed
end
# File lib/addressable/idna/pure.rb, line 256
# Splits a Hangul syllable codepoint into its leading, vowel and trailing
# parts using the HANGUL_* constants.
#
# @param [Integer] codepoint The codepoint to split.
# @return [Array] A three-element array `[l, v, t]`. For a codepoint outside
#   the syllable range it is `[codepoint, nil, nil]`. +t+ is nil when the
#   trailing index is zero.
def self.unicode_decompose_hangul(codepoint)
  offset = codepoint - HANGUL_SBASE
  return [codepoint, nil, nil] if offset < 0 || offset >= HANGUL_SCOUNT

  lead = HANGUL_LBASE + offset / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (offset % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trail_index = offset % HANGUL_TCOUNT
  # Index 0 would map to HANGUL_TBASE itself, which is reported as nil.
  trail = trail_index == 0 ? nil : HANGUL_TBASE + trail_index

  [lead, vowel, trail]
end
Unicode aware downcase method.
@api private
@param [String] input The input string.
@return [String] The downcased result.
# File lib/addressable/idna/pure.rb, line 121
# Downcases a string one codepoint at a time via lookup_unicode_lowercase.
#
# @param [String] input The string to downcase; it is unpacked with "U*".
# @return [String] A new string packed from the mapped codepoints.
def self.unicode_downcase(input)
  input.unpack("U*")
       .map { |codepoint| lookup_unicode_lowercase(codepoint) }
       .pack("U*")
end
# File lib/addressable/idna/pure.rb, line 211
# Reorders runs of combining marks so their combining classes ascend.
#
# Works on a copy of the input. Two neighbours are swapped when both have a
# non-zero combining class and the earlier one's class is greater. After a
# swap the cursor steps back one position (never below 1) so the moved
# codepoint is compared with its new predecessor.
#
# @param [Array<Integer>] unpacked The codepoints to reorder.
# @return [Array<Integer>] A reordered copy; the argument is not modified.
def self.unicode_sort_canonical(unpacked)
  sorted = unpacked.dup
  count = sorted.length
  return sorted if count < 2

  pos = 1
  while pos < count
    prev_cp = sorted[pos - 1]
    this_cp = sorted[pos]
    prev_cc = lookup_unicode_combining_class(prev_cp)
    this_cc = lookup_unicode_combining_class(this_cp)

    out_of_order = this_cc != 0 && prev_cc != 0 && prev_cc > this_cc
    if out_of_order
      sorted[pos - 1], sorted[pos] = this_cp, prev_cp
      pos -= 1 if pos > 1
    else
      pos += 1
    end
  end
  sorted
end