utf32.js 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. "use strict"
  2. var Buffer = require("safer-buffer").Buffer
  3. // == UTF32-LE/BE codec. ==========================================================
  4. exports._utf32 = Utf32Codec
  5. function Utf32Codec (codecOptions, iconv) {
  6. this.iconv = iconv
  7. this.bomAware = true
  8. this.isLE = codecOptions.isLE
  9. }
  10. exports.utf32le = { type: "_utf32", isLE: true }
  11. exports.utf32be = { type: "_utf32", isLE: false }
  12. // Aliases
  13. exports.ucs4le = "utf32le"
  14. exports.ucs4be = "utf32be"
  15. Utf32Codec.prototype.encoder = Utf32Encoder
  16. Utf32Codec.prototype.decoder = Utf32Decoder
  17. // -- Encoding
  18. function Utf32Encoder (options, codec) {
  19. this.isLE = codec.isLE
  20. this.highSurrogate = 0
  21. }
  22. Utf32Encoder.prototype.write = function (str) {
  23. var src = Buffer.from(str, "ucs2")
  24. var dst = Buffer.alloc(src.length * 2)
  25. var write32 = this.isLE ? dst.writeUInt32LE : dst.writeUInt32BE
  26. var offset = 0
  27. for (var i = 0; i < src.length; i += 2) {
  28. var code = src.readUInt16LE(i)
  29. var isHighSurrogate = (code >= 0xD800 && code < 0xDC00)
  30. var isLowSurrogate = (code >= 0xDC00 && code < 0xE000)
  31. if (this.highSurrogate) {
  32. if (isHighSurrogate || !isLowSurrogate) {
  33. // There shouldn't be two high surrogates in a row, nor a high surrogate which isn't followed by a low
  34. // surrogate. If this happens, keep the pending high surrogate as a stand-alone semi-invalid character
  35. // (technically wrong, but expected by some applications, like Windows file names).
  36. write32.call(dst, this.highSurrogate, offset)
  37. offset += 4
  38. } else {
  39. // Create 32-bit value from high and low surrogates;
  40. var codepoint = (((this.highSurrogate - 0xD800) << 10) | (code - 0xDC00)) + 0x10000
  41. write32.call(dst, codepoint, offset)
  42. offset += 4
  43. this.highSurrogate = 0
  44. continue
  45. }
  46. }
  47. if (isHighSurrogate) { this.highSurrogate = code } else {
  48. // Even if the current character is a low surrogate, with no previous high surrogate, we'll
  49. // encode it as a semi-invalid stand-alone character for the same reasons expressed above for
  50. // unpaired high surrogates.
  51. write32.call(dst, code, offset)
  52. offset += 4
  53. this.highSurrogate = 0
  54. }
  55. }
  56. if (offset < dst.length) { dst = dst.slice(0, offset) }
  57. return dst
  58. }
  59. Utf32Encoder.prototype.end = function () {
  60. // Treat any leftover high surrogate as a semi-valid independent character.
  61. if (!this.highSurrogate) { return }
  62. var buf = Buffer.alloc(4)
  63. if (this.isLE) { buf.writeUInt32LE(this.highSurrogate, 0) } else { buf.writeUInt32BE(this.highSurrogate, 0) }
  64. this.highSurrogate = 0
  65. return buf
  66. }
  67. // -- Decoding
  68. function Utf32Decoder (options, codec) {
  69. this.isLE = codec.isLE
  70. this.badChar = codec.iconv.defaultCharUnicode.charCodeAt(0)
  71. this.overflow = []
  72. }
  73. Utf32Decoder.prototype.write = function (src) {
  74. if (src.length === 0) { return "" }
  75. var i = 0
  76. var codepoint = 0
  77. var dst = Buffer.alloc(src.length + 4)
  78. var offset = 0
  79. var isLE = this.isLE
  80. var overflow = this.overflow
  81. var badChar = this.badChar
  82. if (overflow.length > 0) {
  83. for (; i < src.length && overflow.length < 4; i++) { overflow.push(src[i]) }
  84. if (overflow.length === 4) {
  85. // NOTE: codepoint is a signed int32 and can be negative.
  86. // NOTE: We copied this block from below to help V8 optimize it (it works with array, not buffer).
  87. if (isLE) {
  88. codepoint = overflow[i] | (overflow[i + 1] << 8) | (overflow[i + 2] << 16) | (overflow[i + 3] << 24)
  89. } else {
  90. codepoint = overflow[i + 3] | (overflow[i + 2] << 8) | (overflow[i + 1] << 16) | (overflow[i] << 24)
  91. }
  92. overflow.length = 0
  93. offset = _writeCodepoint(dst, offset, codepoint, badChar)
  94. }
  95. }
  96. // Main loop. Should be as optimized as possible.
  97. for (; i < src.length - 3; i += 4) {
  98. // NOTE: codepoint is a signed int32 and can be negative.
  99. if (isLE) {
  100. codepoint = src[i] | (src[i + 1] << 8) | (src[i + 2] << 16) | (src[i + 3] << 24)
  101. } else {
  102. codepoint = src[i + 3] | (src[i + 2] << 8) | (src[i + 1] << 16) | (src[i] << 24)
  103. }
  104. offset = _writeCodepoint(dst, offset, codepoint, badChar)
  105. }
  106. // Keep overflowing bytes.
  107. for (; i < src.length; i++) {
  108. overflow.push(src[i])
  109. }
  110. return dst.slice(0, offset).toString("ucs2")
  111. }
  112. function _writeCodepoint (dst, offset, codepoint, badChar) {
  113. // NOTE: codepoint is signed int32 and can be negative. We keep it that way to help V8 with optimizations.
  114. if (codepoint < 0 || codepoint > 0x10FFFF) {
  115. // Not a valid Unicode codepoint
  116. codepoint = badChar
  117. }
  118. // Ephemeral Planes: Write high surrogate.
  119. if (codepoint >= 0x10000) {
  120. codepoint -= 0x10000
  121. var high = 0xD800 | (codepoint >> 10)
  122. dst[offset++] = high & 0xff
  123. dst[offset++] = high >> 8
  124. // Low surrogate is written below.
  125. var codepoint = 0xDC00 | (codepoint & 0x3FF)
  126. }
  127. // Write BMP char or low surrogate.
  128. dst[offset++] = codepoint & 0xff
  129. dst[offset++] = codepoint >> 8
  130. return offset
  131. };
  132. Utf32Decoder.prototype.end = function () {
  133. this.overflow.length = 0
  134. }
  135. // == UTF-32 Auto codec =============================================================
  136. // Decoder chooses automatically from UTF-32LE and UTF-32BE using BOM and space-based heuristic.
  137. // Defaults to UTF-32LE. http://en.wikipedia.org/wiki/UTF-32
  138. // Encoder/decoder default can be changed: iconv.decode(buf, 'utf32', {defaultEncoding: 'utf-32be'});
  139. // Encoder prepends BOM (which can be overridden with {addBOM: false}).
  140. exports.utf32 = Utf32AutoCodec
  141. exports.ucs4 = "utf32"
  142. function Utf32AutoCodec (options, iconv) {
  143. this.iconv = iconv
  144. }
  145. Utf32AutoCodec.prototype.encoder = Utf32AutoEncoder
  146. Utf32AutoCodec.prototype.decoder = Utf32AutoDecoder
  147. // -- Encoding
  148. function Utf32AutoEncoder (options, codec) {
  149. options = options || {}
  150. if (options.addBOM === undefined) {
  151. options.addBOM = true
  152. }
  153. this.encoder = codec.iconv.getEncoder(options.defaultEncoding || "utf-32le", options)
  154. }
  155. Utf32AutoEncoder.prototype.write = function (str) {
  156. return this.encoder.write(str)
  157. }
  158. Utf32AutoEncoder.prototype.end = function () {
  159. return this.encoder.end()
  160. }
  161. // -- Decoding
  162. function Utf32AutoDecoder (options, codec) {
  163. this.decoder = null
  164. this.initialBufs = []
  165. this.initialBufsLen = 0
  166. this.options = options || {}
  167. this.iconv = codec.iconv
  168. }
  169. Utf32AutoDecoder.prototype.write = function (buf) {
  170. if (!this.decoder) {
  171. // Codec is not chosen yet. Accumulate initial bytes.
  172. this.initialBufs.push(buf)
  173. this.initialBufsLen += buf.length
  174. if (this.initialBufsLen < 32) // We need more bytes to use space heuristic (see below)
  175. { return "" }
  176. // We have enough bytes -> detect endianness.
  177. var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding)
  178. this.decoder = this.iconv.getDecoder(encoding, this.options)
  179. var resStr = ""
  180. for (var i = 0; i < this.initialBufs.length; i++) { resStr += this.decoder.write(this.initialBufs[i]) }
  181. this.initialBufs.length = this.initialBufsLen = 0
  182. return resStr
  183. }
  184. return this.decoder.write(buf)
  185. }
  186. Utf32AutoDecoder.prototype.end = function () {
  187. if (!this.decoder) {
  188. var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding)
  189. this.decoder = this.iconv.getDecoder(encoding, this.options)
  190. var resStr = ""
  191. for (var i = 0; i < this.initialBufs.length; i++) { resStr += this.decoder.write(this.initialBufs[i]) }
  192. var trail = this.decoder.end()
  193. if (trail) { resStr += trail }
  194. this.initialBufs.length = this.initialBufsLen = 0
  195. return resStr
  196. }
  197. return this.decoder.end()
  198. }
  199. function detectEncoding (bufs, defaultEncoding) {
  200. var b = []
  201. var charsProcessed = 0
  202. var invalidLE = 0; var invalidBE = 0 // Number of invalid chars when decoded as LE or BE.
  203. var bmpCharsLE = 0; var bmpCharsBE = 0 // Number of BMP chars when decoded as LE or BE.
  204. outerLoop:
  205. for (var i = 0; i < bufs.length; i++) {
  206. var buf = bufs[i]
  207. for (var j = 0; j < buf.length; j++) {
  208. b.push(buf[j])
  209. if (b.length === 4) {
  210. if (charsProcessed === 0) {
  211. // Check BOM first.
  212. if (b[0] === 0xFF && b[1] === 0xFE && b[2] === 0 && b[3] === 0) {
  213. return "utf-32le"
  214. }
  215. if (b[0] === 0 && b[1] === 0 && b[2] === 0xFE && b[3] === 0xFF) {
  216. return "utf-32be"
  217. }
  218. }
  219. if (b[0] !== 0 || b[1] > 0x10) invalidBE++
  220. if (b[3] !== 0 || b[2] > 0x10) invalidLE++
  221. if (b[0] === 0 && b[1] === 0 && (b[2] !== 0 || b[3] !== 0)) bmpCharsBE++
  222. if ((b[0] !== 0 || b[1] !== 0) && b[2] === 0 && b[3] === 0) bmpCharsLE++
  223. b.length = 0
  224. charsProcessed++
  225. if (charsProcessed >= 100) {
  226. break outerLoop
  227. }
  228. }
  229. }
  230. }
  231. // Make decisions.
  232. if (bmpCharsBE - invalidBE > bmpCharsLE - invalidLE) return "utf-32be"
  233. if (bmpCharsBE - invalidBE < bmpCharsLE - invalidLE) return "utf-32le"
  234. // Couldn't decide (likely all zeros or not enough data).
  235. return defaultEncoding || "utf-32le"
  236. }