字符编码具有两个不同的组件:
-
编码器,将字符序列转换为数值序列 (字节)。
-
解码器,将字节序列转换为字符序列。
(代理对由一个码位范围为 U+D800 到 U+DBFF 的字符,后跟一个码位范围为 U+DC00 到 U+DFFF 的字符组成。) 回退策略确定编码器如何处理无效字符,或解码器如何处理无效字节。
|
|
|---|
|
.ToBase64String 方法。 |
它们使用解码器将字符从非 Unicode 编码映射为 Unicode。
本主题包含以下几节:
访问 .NET framework 中实现的单个编码对象,请执行以下操作:
-
Replacement Fallback 一节。)
-
Exception Fallback 节。)
-
Best-Fit Fallback 一节。)
-
重载允许您为编码器和解码器指定回退对象。
|
|
|---|
|
应用程序开发人员不必再跟踪用于生成特定语言或编写系统字符的编码方案,并且,数据可以在世界上的各系统之间共享,而不会受到损坏。 Unicode 主页。 |
.NET framework 支持下表中列出的字符编码系统。
|
编码 |
类 |
说明 |
优点/缺点 |
|---|---|---|---|
|
ASCII |
使用字节的低七位对有限范围的字符进行编码。 |
由于此编码仅支持从 U+0000 到 U+007F 的字符值,因此在大多数情况下它不足以满足国际化应用程序的需要。 |
|
|
UTF-7 |
非 ASCII Unicode 字符的 ASCII 字符转义序列表示。 |
因此,应该尽可能使用 UTF-8 而不是 UTF-7。 |
|
|
UTF-8 |
每个 Unicode 码位表示为一到四个字节序列。 |
|
|
|
UTF-16 |
little-endian 和 big-endian 字节顺序支持。 |
WCHAR 值。 |
|
|
UTF-32 |
little-endian 和 big-endian 字节顺序支持。 |
显示时呈现的单个标志符号(字形)仍可能由多个 UTF-32 字符编码。 |
|
|
ANSI/ISO 编码 |
.GetEncoding(Int32) 方法检索特定代码页的编码对象。 |
因此,依赖 ANSI 代码页的应用程序无法在同一文本流中同时存储希腊语和德语,除非该文本流包含指示所引用代码页的标识符。 |
|
|
双字节字符集 (dbcs) 编码 |
.GetEncoding(Int32) 方法检索特定 DBCS 的编码对象。 |
由于同一对双字节码位在不同的代码页中可以表示不同的字符,此方案仍然不允许在同一数据流中组合使用两种语言(如日语和中文)。 |
Encoding 并重写其成员派生的类来创建自定义编码。
GetByteCount 方法。
[], Int32) 输入该字符串中的字符。
Imports System.Text

''' <summary>
''' Encodes several strings into one shared byte array with the ASCII encoding,
''' prints the encoded bytes in hexadecimal, and decodes the filled part of the
''' array back into a single string.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      Dim sentences() As String = { "This is the first sentence. ",
                                    "This is the second sentence. " }
      Dim ascii As Encoding = Encoding.ASCII

      ' Shared output buffer (indexes 0 through 50).
      Dim bytes(50) As Byte
      ' Offset in the buffer at which the next string is written.
      Dim position As Integer = 0

      Console.WriteLine("Strings to encode:")
      For Each sentence As String In sentences
         Console.WriteLine(" {0}", sentence)

         ' Ask the encoding how many bytes the string needs and enlarge the
         ' buffer by 50 elements when it would reach the end of the array.
         Dim needed As Integer = ascii.GetByteCount(sentence)
         If Not (needed + position < bytes.Length) Then
            ReDim Preserve bytes(bytes.Length + 49)
         End If

         ' GetBytes reports how many bytes it actually wrote.
         position += ascii.GetBytes(sentence, 0, sentence.Length, bytes, position)
      Next

      Console.WriteLine()
      Console.WriteLine("Encoded bytes:")
      Console.WriteLine("{0}", ShowByteValues(bytes, position))
      Console.WriteLine()

      ' Decode only the part of the buffer that was filled.
      Dim decoded As String = ascii.GetString(bytes, 0, position)
      Console.WriteLine("Decoded: {0}", decoded)
   End Sub

   ''' <summary>
   ''' Formats the first <paramref name="last"/> bytes as two-digit hexadecimal
   ''' values, starting a new line before every 20th value.
   ''' </summary>
   ''' <param name="bytes">The buffer to format.</param>
   ''' <param name="last">The number of bytes, counted from index 0, to include.</param>
   ''' <returns>The formatted hexadecimal dump.</returns>
   Private Function ShowByteValues(bytes As Byte(), last As Integer) As String
      Dim dump As New StringBuilder(" ")
      Dim position As Integer = 0
      While position < last
         If position Mod 20 = 0 Then
            dump.Append(vbCrLf).Append(" ")
         End If
         dump.AppendFormat("{0:X2} ", bytes(position))
         position += 1
      End While
      Return dump.ToString()
   End Function
End Module
' The example displays the following output:
'       Strings to encode:
'          This is the first sentence.
'          This is the second sentence.
'
'       Encoded bytes:
'
'          54 68 69 73 20 69 73 20 74 68 65 20 66 69 72 73 74 20 73 65
'          6E 74 65 6E 63 65 2E 20 54 68 69 73 20 69 73 20 74 68 65 20
'          73 65 63 6F 6E 64 20 73 65 6E 74 65 6E 63 65 2E 20
'
'       Decoded: This is the first sentence. This is the second sentence.
using System;
using System.Text;

/// <summary>
/// Encodes several strings into one shared byte array with the ASCII encoding,
/// prints the encoded bytes in hexadecimal, and decodes the filled part of the
/// array back into a single string.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        string[] sentences = { "This is the first sentence. ",
                               "This is the second sentence. " };
        Encoding ascii = Encoding.ASCII;

        // Shared output buffer.
        byte[] bytes = new byte[49];
        // Offset in the buffer at which the next string is written.
        int position = 0;

        Console.WriteLine("Strings to encode:");
        for (int i = 0; i < sentences.Length; i++)
        {
            string sentence = sentences[i];
            Console.WriteLine(" {0}", sentence);

            // Ask the encoding how many bytes the string needs and enlarge the
            // buffer by 50 elements when it would reach the end of the array.
            int needed = ascii.GetByteCount(sentence);
            if (!(needed + position < bytes.Length))
            {
                Array.Resize(ref bytes, bytes.Length + 50);
            }

            // GetBytes reports how many bytes it actually wrote.
            position += ascii.GetBytes(sentence, 0, sentence.Length, bytes, position);
        }

        Console.WriteLine("\nEncoded bytes:");
        Console.WriteLine("{0}", ShowByteValues(bytes, position));
        Console.WriteLine();

        // Decode only the part of the buffer that was filled.
        string decoded = ascii.GetString(bytes, 0, position);
        Console.WriteLine("Decoded: {0}", decoded);
    }

    /// <summary>
    /// Formats the first <paramref name="last"/> bytes as two-digit hexadecimal
    /// values, starting a new line before every 20th value.
    /// </summary>
    /// <param name="bytes">The buffer to format.</param>
    /// <param name="last">The number of bytes, counted from index 0, to include.</param>
    /// <returns>The formatted hexadecimal dump.</returns>
    private static string ShowByteValues(byte[] bytes, int last)
    {
        StringBuilder dump = new StringBuilder(" ");
        int position = 0;
        while (position < last)
        {
            if (position % 20 == 0)
            {
                dump.Append("\n ");
            }
            dump.AppendFormat("{0:X2} ", bytes[position]);
            position++;
        }
        return dump.ToString();
    }
}
// The example displays the following output:
//       Strings to encode:
//          This is the first sentence.
//          This is the second sentence.
//
//       Encoded bytes:
//
//          54 68 69 73 20 69 73 20 74 68 65 20 66 69 72 73 74 20 73 65
//          6E 74 65 6E 63 65 2E 20 54 68 69 73 20 69 73 20 74 68 65 20
//          73 65 63 6F 6E 64 20 73 65 6E 74 65 6E 63 65 2E 20
//
//       Decoded: This is the first sentence. This is the second sentence.
GetCharCount 方法。
[], Int32) 方法解码字节数组。
Imports System.Text

''' <summary>
''' Encodes three strings to ASCII one at a time, decodes each byte array into
''' one shared character array, and finally builds a single string from the
''' characters that were written.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      Dim strings() As String = { "This is the first sentence. ",
                                  "This is the second sentence. ",
                                  "This is the third sentence. " }
      Dim asciiEncoding As Encoding = Encoding.ASCII
      ' Array to hold encoded bytes.
      Dim bytes() As Byte
      ' Array to hold decoded characters.
      Dim chars(50) As Char
      ' Number of characters written so far; also the next write position.
      Dim index As Integer = 0

      For Each stringValue In strings
         Console.WriteLine("String to Encode: {0}", stringValue)
         ' Encode the string to a byte array.
         bytes = asciiEncoding.GetBytes(stringValue)
         ' Display the encoded bytes, 20 per line.
         Console.Write("Encoded bytes: ")
         For ctr As Integer = 0 To bytes.Length - 1
            Console.Write(" {0}{1:X2}", If(ctr Mod 20 = 0, vbCrLf, ""), bytes(ctr))
         Next
         Console.WriteLine()

         ' Decode the bytes into the shared character array. Grow by at least
         ' 50 elements, and by more when this chunk alone needs more room.
         Dim count As Integer = asciiEncoding.GetCharCount(bytes)
         If count + index >= chars.Length Then
            Array.Resize(chars, Math.Max(chars.Length + 50, index + count))
         End If
         Dim written As Integer = asciiEncoding.GetChars(bytes, 0, bytes.Length, chars, index)
         index = index + written
         Console.WriteLine()
      Next

      ' index is the number of characters written, so it is the exact length of
      ' the result. (Passing index - 1 would drop the last decoded character.)
      Dim decodedString As New String(chars, 0, index)
      Console.WriteLine("Decoded string: ")
      Console.WriteLine(decodedString)
   End Sub
End Module
' The example displays the following output:
'    String to Encode: This is the first sentence.
'    Encoded bytes:
'    54 68 69 73 20 69 73 20 74 68 65 20 66 69 72 73 74 20 73 65
'    6E 74 65 6E 63 65 2E 20
'
'    String to Encode: This is the second sentence.
'    Encoded bytes:
'    54 68 69 73 20 69 73 20 74 68 65 20 73 65 63 6F 6E 64 20 73
'    65 6E 74 65 6E 63 65 2E 20
'
'    String to Encode: This is the third sentence.
'    Encoded bytes:
'    54 68 69 73 20 69 73 20 74 68 65 20 74 68 69 72 64 20 73 65
'    6E 74 65 6E 63 65 2E 20
'
'    Decoded string:
'    This is the first sentence. This is the second sentence. This is the third sentence.
using System;
using System.Text;

/// <summary>
/// Encodes three strings to ASCII one at a time, decodes each byte array into
/// one shared character array, and finally builds a single string from the
/// characters that were written.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        string[] strings = { "This is the first sentence. ",
                             "This is the second sentence. ",
                             "This is the third sentence. " };
        Encoding asciiEncoding = Encoding.ASCII;
        // Array to hold encoded bytes.
        byte[] bytes;
        // Array to hold decoded characters.
        char[] chars = new char[50];
        // Number of characters written so far; also the next write position.
        int index = 0;

        foreach (var stringValue in strings)
        {
            Console.WriteLine("String to Encode: {0}", stringValue);
            // Encode the string to a byte array.
            bytes = asciiEncoding.GetBytes(stringValue);
            // Display the encoded bytes, 20 per line.
            Console.Write("Encoded bytes: ");
            for (int ctr = 0; ctr < bytes.Length; ctr++)
            {
                Console.Write(" {0}{1:X2}",
                              ctr % 20 == 0 ? Environment.NewLine : "",
                              bytes[ctr]);
            }
            Console.WriteLine();

            // Decode the bytes into the shared character array. Grow by at least
            // 50 elements, and by more when this chunk alone needs more room.
            int count = asciiEncoding.GetCharCount(bytes);
            if (count + index >= chars.Length)
            {
                Array.Resize(ref chars, Math.Max(chars.Length + 50, index + count));
            }
            int written = asciiEncoding.GetChars(bytes, 0, bytes.Length, chars, index);
            index = index + written;
            Console.WriteLine();
        }

        // index is the number of characters written, so it is the exact length of
        // the result. (Passing index - 1 would drop the last decoded character.)
        string decodedString = new string(chars, 0, index);
        Console.WriteLine("Decoded string: ");
        Console.WriteLine(decodedString);
    }
}
// The example displays the following output:
//    String to Encode: This is the first sentence.
//    Encoded bytes:
//    54 68 69 73 20 69 73 20 74 68 65 20 66 69 72 73 74 20 73 65
//    6E 74 65 6E 63 65 2E 20
//
//    String to Encode: This is the second sentence.
//    Encoded bytes:
//    54 68 69 73 20 69 73 20 74 68 65 20 73 65 63 6F 6E 64 20 73
//    65 6E 74 65 6E 63 65 2E 20
//
//    String to Encode: This is the third sentence.
//    Encoded bytes:
//    54 68 69 73 20 69 73 20 74 68 65 20 74 68 69 72 64 20 73 65
//    6E 74 65 6E 63 65 2E 20
//
//    Decoded string:
//    This is the first sentence. This is the second sentence. This is the third sentence.
Decoder 能够处理跨越多个方法调用的编码和解码操作。
.GetString的方法。
.GetChars 方法成功解码字节数组获取原始字符串。
Imports System.IO
Imports System.Text

''' <summary>
''' Writes a UTF-16 (big-endian) encoded string to a file, then reads it back in
''' 10-byte chunks twice: once decoding every chunk with Encoding.GetString, and
''' once with a single Decoder object, and compares each result with the original.
''' </summary>
Module Example
   ' File used to round-trip the encoded bytes.
   Private Const FilePath As String = ".\characters.bin"

   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      ' Use default replacement fallback for invalid encoding.
      Dim enc As New UnicodeEncoding(True, False, False)

      ' Define a string with various Unicode characters, including a surrogate pair.
      Dim str1 As String = String.Format("AB YZ 19 {0}{1} {2}",
                                         ChrW(&hD800), ChrW(&hDC05), ChrW(&h00e4))
      str1 += String.Format("Unicode characters. {0} {1} s {2}{3}",
                            ChrW(&h00a9), ChrW(&h010C), ChrW(&h0062), ChrW(&h0308))
      Console.WriteLine("Created original string...")
      Console.WriteLine()

      ' Convert string to byte array and write it to the file. The Using block
      ' closes the file even if the write fails.
      Dim bytes() As Byte = enc.GetBytes(str1)
      Using bw As New BinaryWriter(File.Create(FilePath))
         bw.Write(bytes)
      End Using

      Const count As Integer = 10            ' Number of bytes to read at a time.
      Dim bytesRead(count - 1) As Byte       ' Buffer (byte array).
      Dim read As Integer                    ' Number of bytes actually read.
      Dim str2 As String = ""                ' Decoded string.

      ' Try using Encoding object for all operations. Each chunk is decoded on
      ' its own, so a surrogate pair that straddles two chunks cannot be decoded.
      Using br As New BinaryReader(File.OpenRead(FilePath))
         Do
            read = br.Read(bytesRead, 0, count)
            str2 += enc.GetString(bytesRead, 0, read)
         Loop While read = count
      End Using
      Console.WriteLine("Decoded string using UnicodeEncoding.GetString()...")
      CompareForEquality(str1, str2)
      Console.WriteLine()

      ' Use Decoder for all operations; the same decoder instance is used for every chunk.
      Dim decoder As Decoder = enc.GetDecoder()
      Dim chars(50) As Char
      Dim index As Integer = 0               ' Next character to write in array.
      Dim written As Integer = 0             ' Number of chars written to array.
      Using br As New BinaryReader(File.OpenRead(FilePath))
         Do
            read = br.Read(bytesRead, 0, count)
            If index + decoder.GetCharCount(bytesRead, 0, read) - 1 >= chars.Length Then
               Array.Resize(chars, chars.Length + 50)
            End If
            written = decoder.GetChars(bytesRead, 0, read, chars, index)
            index += written
         Loop While read = count
      End Using

      ' Instantiate a string with the decoded characters.
      Dim str3 As New String(chars, 0, index)
      Console.WriteLine("Decoded string using UnicodeEncoding.Decoder.GetString()...")
      CompareForEquality(str1, str3)
   End Sub

   ''' <summary>
   ''' Reports whether two strings are ordinally equal and, when they are not,
   ''' prints the UTF-16 code units of both.
   ''' </summary>
   ''' <param name="original">The string that was encoded.</param>
   ''' <param name="decoded">The string produced by decoding.</param>
   Private Sub CompareForEquality(original As String, decoded As String)
      ' Compare once and reuse the result for both the message and the branch.
      Dim result As Boolean = original.Equals(decoded, StringComparison.Ordinal)
      Console.WriteLine("original = decoded: {0}", result)
      If Not result Then
         Console.WriteLine("Code points in original string:")
         For Each ch In original
            Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"))
         Next
         Console.WriteLine()

         Console.WriteLine("Code points in decoded string:")
         For Each ch In decoded
            Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"))
         Next
         Console.WriteLine()
      End If
   End Sub
End Module
' The example displays the following output:
'    Created original string...
'
'    Decoded string using UnicodeEncoding.GetString()...
'    original = decoded: False
'    Code points in original string:
'    0041 0042 0020 0059 005A 0020 0031 0039 0020 D800 DC05 0020 00E4 0055 006E 0069 0063 006F
'    0064 0065 0020 0063 0068 0061 0072 0061 0063 0074 0065 0072 0073 002E 0020 00A9 0020 010C
'    0020 0073 0020 0062 0308
'    Code points in decoded string:
'    0041 0042 0020 0059 005A 0020 0031 0039 0020 FFFD FFFD 0020 00E4 0055 006E 0069 0063 006F
'    0064 0065 0020 0063 0068 0061 0072 0061 0063 0074 0065 0072 0073 002E 0020 00A9 0020 010C
'    0020 0073 0020 0062 0308
'
'    Decoded string using UnicodeEncoding.Decoder.GetString()...
'    original = decoded: True
using System;
using System.IO;
using System.Text;

/// <summary>
/// Writes a UTF-16 (big-endian) encoded string to a file, then reads it back in
/// 10-byte chunks twice: once decoding every chunk with Encoding.GetString, and
/// once with a single Decoder object, and compares each result with the original.
/// </summary>
public class Example
{
    // File used to round-trip the encoded bytes.
    private const string FilePath = @".\characters.bin";

    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        // Use default replacement fallback for invalid encoding.
        UnicodeEncoding enc = new UnicodeEncoding(true, false, false);

        // Define a string with various Unicode characters, including a surrogate pair.
        string str1 = "AB YZ 19 \uD800\udc05 \u00e4";
        str1 += "Unicode characters. \u00a9 \u010C s \u0062\u0308";
        Console.WriteLine("Created original string...\n");

        // Convert string to byte array and write it to the file. The using block
        // closes the file even if the write fails.
        byte[] bytes = enc.GetBytes(str1);
        using (BinaryWriter bw = new BinaryWriter(File.Create(FilePath)))
        {
            bw.Write(bytes);
        }

        const int count = 10;                 // Number of bytes to read at a time.
        byte[] bytesRead = new byte[count];   // Buffer (byte array).
        int read;                             // Number of bytes actually read.
        string str2 = String.Empty;           // Decoded string.

        // Try using Encoding object for all operations. Each chunk is decoded on
        // its own, so a surrogate pair that straddles two chunks cannot be decoded.
        using (BinaryReader br = new BinaryReader(File.OpenRead(FilePath)))
        {
            do
            {
                read = br.Read(bytesRead, 0, count);
                str2 += enc.GetString(bytesRead, 0, read);
            } while (read == count);
        }
        Console.WriteLine("Decoded string using UnicodeEncoding.GetString()...");
        CompareForEquality(str1, str2);
        Console.WriteLine();

        // Use Decoder for all operations; the same decoder instance is used for every chunk.
        Decoder decoder = enc.GetDecoder();
        char[] chars = new char[50];
        int index = 0;                        // Next character to write in array.
        int written = 0;                      // Number of chars written to array.
        using (BinaryReader br = new BinaryReader(File.OpenRead(FilePath)))
        {
            do
            {
                read = br.Read(bytesRead, 0, count);
                if (index + decoder.GetCharCount(bytesRead, 0, read) - 1 >= chars.Length)
                {
                    Array.Resize(ref chars, chars.Length + 50);
                }
                written = decoder.GetChars(bytesRead, 0, read, chars, index);
                index += written;
            } while (read == count);
        }

        // Instantiate a string with the decoded characters.
        string str3 = new String(chars, 0, index);
        Console.WriteLine("Decoded string using UnicodeEncoding.Decoder.GetString()...");
        CompareForEquality(str1, str3);
    }

    /// <summary>
    /// Reports whether two strings are ordinally equal and, when they are not,
    /// prints the UTF-16 code units of both.
    /// </summary>
    /// <param name="original">The string that was encoded.</param>
    /// <param name="decoded">The string produced by decoding.</param>
    private static void CompareForEquality(string original, string decoded)
    {
        // Compare once and reuse the result for both the message and the branch.
        bool result = original.Equals(decoded, StringComparison.Ordinal);
        Console.WriteLine("original = decoded: {0}", result);
        if (!result)
        {
            Console.WriteLine("Code points in original string:");
            foreach (var ch in original)
            {
                Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"));
            }
            Console.WriteLine();

            Console.WriteLine("Code points in decoded string:");
            foreach (var ch in decoded)
            {
                Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"));
            }
            Console.WriteLine();
        }
    }
}
// The example displays the following output:
//    Created original string...
//
//    Decoded string using UnicodeEncoding.GetString()...
//    original = decoded: False
//    Code points in original string:
//    0041 0042 0020 0059 005A 0020 0031 0039 0020 D800 DC05 0020 00E4 0055 006E 0069 0063 006F
//    0064 0065 0020 0063 0068 0061 0072 0061 0063 0074 0065 0072 0073 002E 0020 00A9 0020 010C
//    0020 0073 0020 0062 0308
//    Code points in decoded string:
//    0041 0042 0020 0059 005A 0020 0031 0039 0020 FFFD FFFD 0020 00E4 0055 006E 0069 0063 006F
//    0064 0065 0020 0063 0068 0061 0072 0061 0063 0074 0065 0072 0073 002E 0020 00A9 0020 010C
//    0020 0073 0020 0062 0308
//
//    Decoded string using UnicodeEncoding.Decoder.GetString()...
//    original = decoded: True
有三种类型的回退策略:
-
最佳回退
-
替换回退
-
异常回退
|
|
|---|
|
只要有可能,就应在实例化编码对象时指定它所使用的回退策略。 |
最佳回退
.GetEncoding(String) 重载检索代码页的默认和双字节字符集编码。
|
|
|---|
|
UTF32Encoding) 支持每个字符集中的每个字符,因此,它们可用于消除最佳回退问题。 |
默认情况下,此字符串是一个问号 (U+003F)。
DIGIT EIGHT 并不是不受支持的 INFINITY 字符的恰当替换,而 QUESTION MARK 则表示原始字符没有可用的映射。
Imports System.Text

''' <summary>
''' Encodes a string of three non-Latin symbols with the code page 1252 encoding
''' returned by Encoding.GetEncoding(1252), decodes the bytes again, and shows
''' whether the string survived the round trip.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      ' Get an encoding for code page 1252 (Western Europe character set).
      Dim cp1252 As Encoding = Encoding.GetEncoding(1252)

      ' Define and display a string.
      Dim str As String = String.Format("{0} {1} {2}", ChrW(&h24c8), ChrW(&H2075), ChrW(&h221E))
      Console.WriteLine("Original string: " + str)
      Console.Write("Code points in string: ")
      WriteCodePoints(str)
      Console.WriteLine()
      Console.WriteLine()

      ' Encode the Unicode string and show each resulting byte.
      Dim bytes() As Byte = cp1252.GetBytes(str)
      Console.Write("Encoded bytes: ")
      For position As Integer = 0 To bytes.Length - 1
         Console.Write("{0:X2} ", bytes(position))
      Next
      Console.WriteLine()
      Console.WriteLine()

      ' Decode the bytes and compare with the original.
      Dim str2 As String = cp1252.GetString(bytes)
      Dim roundTripped As Boolean = str.Equals(str2)
      Console.WriteLine("String round-tripped: {0}", roundTripped)
      If roundTripped Then Return

      Console.WriteLine(str2)
      WriteCodePoints(str2)
   End Sub

   ''' <summary>Writes each UTF-16 code unit of <paramref name="text"/> as four hex digits followed by a space.</summary>
   Private Sub WriteCodePoints(text As String)
      For Each ch As Char In text
         Console.Write("{0:X4} ", Convert.ToUInt16(ch))
      Next
   End Sub
End Module
' The example displays the following output:
'       Original string: Ⓢ ⁵ ∞
'       Code points in string: 24C8 0020 2075 0020 221E
'
'       Encoded bytes: 3F 20 35 20 38
'
'       String round-tripped: False
'       ? 5 8
'       003F 0020 0035 0020 0038
using System;
using System.Text;

/// <summary>
/// Encodes a string of three non-Latin symbols with the code page 1252 encoding
/// returned by Encoding.GetEncoding(1252), decodes the bytes again, and shows
/// whether the string survived the round trip.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        // Get an encoding for code page 1252 (Western Europe character set).
        Encoding cp1252 = Encoding.GetEncoding(1252);

        // Define and display a string.
        string str = "\u24c8 \u2075 \u221e";
        Console.WriteLine("Original string: " + str);
        Console.Write("Code points in string: ");
        WriteCodePoints(str);
        Console.WriteLine("\n");

        // Encode the Unicode string and show each resulting byte.
        Byte[] bytes = cp1252.GetBytes(str);
        Console.Write("Encoded bytes: ");
        for (int position = 0; position < bytes.Length; position++)
        {
            Console.Write("{0:X2} ", bytes[position]);
        }
        Console.WriteLine("\n");

        // Decode the bytes and compare with the original.
        string str2 = cp1252.GetString(bytes);
        bool roundTripped = str.Equals(str2);
        Console.WriteLine("String round-tripped: {0}", roundTripped);
        if (roundTripped)
        {
            return;
        }

        Console.WriteLine(str2);
        WriteCodePoints(str2);
    }

    /// <summary>Writes each UTF-16 code unit of <paramref name="text"/> as four hex digits followed by a space.</summary>
    private static void WriteCodePoints(string text)
    {
        foreach (char ch in text)
        {
            Console.Write("{0:X4} ", (ushort) ch);
        }
    }
}
// The example displays the following output:
//       Original string: Ⓢ ⁵ ∞
//       Code points in string: 24C8 0020 2075 0020 221E
//
//       Encoded bytes: 3F 20 35 20 38
//
//       String round-tripped: False
//       ? 5 8
//       003F 0020 0035 0020 0038
例如,应用程序不应对域名使用最佳匹配编码。
|
|
|---|
|
Implementing a Custom Fallback Strategy 。 |
下面的示例将无法映射到代码页 1252 的每个字符替换为星号 (*)。
Imports System.Text

''' <summary>
''' Encodes and decodes a string with a code page 1252 encoding whose encoder
''' and decoder both use "*" as the replacement fallback string.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      ' Replacement fallbacks for both directions, each substituting "*".
      Dim encoderFallback As New EncoderReplacementFallback("*")
      Dim decoderFallback As New DecoderReplacementFallback("*")
      Dim cp1252r As Encoding = Encoding.GetEncoding(1252, encoderFallback, decoderFallback)

      Dim str1 As String = String.Format("{0} {1} {2}", ChrW(&h24C8), ChrW(&h2075), ChrW(&h221E))
      Console.WriteLine(str1)
      WriteCodePoints(str1)
      Console.WriteLine()

      ' Round-trip the string through the encoding.
      Dim str2 As String = cp1252r.GetString(cp1252r.GetBytes(str1))
      Dim roundTripped As Boolean = str1.Equals(str2)
      Console.WriteLine("Round-trip: {0}", roundTripped)
      If roundTripped Then Return

      Console.WriteLine(str2)
      WriteCodePoints(str2)
      Console.WriteLine()
   End Sub

   ''' <summary>Writes each UTF-16 code unit of <paramref name="text"/> as four hex digits followed by a space.</summary>
   Private Sub WriteCodePoints(text As String)
      For Each ch As Char In text
         Console.Write("{0:X4} ", Convert.ToUInt16(ch))
      Next
   End Sub
End Module
' The example displays the following output:
'       Ⓢ ⁵ ∞
'       24C8 0020 2075 0020 221E
'       Round-trip: False
'       * * *
'       002A 0020 002A 0020 002A
using System;
using System.Text;

/// <summary>
/// Encodes and decodes a string with a code page 1252 encoding whose encoder
/// and decoder both use "*" as the replacement fallback string.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        // Replacement fallbacks for both directions, each substituting "*".
        EncoderReplacementFallback encoderFallback = new EncoderReplacementFallback("*");
        DecoderReplacementFallback decoderFallback = new DecoderReplacementFallback("*");
        Encoding cp1252r = Encoding.GetEncoding(1252, encoderFallback, decoderFallback);

        string str1 = "\u24C8 \u2075 \u221E";
        Console.WriteLine(str1);
        WriteCodePoints(str1);
        Console.WriteLine();

        // Round-trip the string through the encoding.
        string str2 = cp1252r.GetString(cp1252r.GetBytes(str1));
        bool roundTripped = str1.Equals(str2);
        Console.WriteLine("Round-trip: {0}", roundTripped);
        if (roundTripped)
        {
            return;
        }

        Console.WriteLine(str2);
        WriteCodePoints(str2);
        Console.WriteLine();
    }

    /// <summary>Writes each UTF-16 code unit of <paramref name="text"/> as four hex digits followed by a space.</summary>
    private static void WriteCodePoints(string text)
    {
        foreach (char ch in text)
        {
            Console.Write("{0:X4} ", (ushort) ch);
        }
    }
}
// The example displays the following output:
//       Ⓢ ⁵ ∞
//       24C8 0020 2075 0020 221E
//       Round-trip: False
//       * * *
//       002A 0020 002A 0020 002A
替换回退
如输出所示,无法编码为 ASCII 字节值的每个字符都被替换为 0x3F,即问号的 ASCII 代码。
Imports System.Text

''' <summary>
''' Encodes a string of three non-ASCII symbols with Encoding.ASCII, prints the
''' resulting bytes, decodes them again, and shows whether the string survived
''' the round trip.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      Dim enc As Encoding = Encoding.ASCII

      Dim str1 As String = String.Format("{0} {1} {2}", ChrW(&h24C8), ChrW(&h2075), ChrW(&h221E))
      Console.WriteLine(str1)
      WriteCodePoints(str1)
      Console.WriteLine()
      Console.WriteLine()

      ' Encode the original string using the ASCII encoder.
      Dim bytes() As Byte = enc.GetBytes(str1)
      Console.Write("Encoded bytes: ")
      For position As Integer = 0 To bytes.Length - 1
         Console.Write("{0:X2} ", bytes(position))
      Next
      Console.WriteLine()
      Console.WriteLine()

      ' Decode the ASCII bytes and compare with the original.
      Dim str2 As String = enc.GetString(bytes)
      Dim roundTripped As Boolean = str1.Equals(str2)
      Console.WriteLine("Round-trip: {0}", roundTripped)
      If roundTripped Then Return

      Console.WriteLine(str2)
      WriteCodePoints(str2)
      Console.WriteLine()
   End Sub

   ''' <summary>Writes each UTF-16 code unit of <paramref name="text"/> as four hex digits followed by a space.</summary>
   Private Sub WriteCodePoints(text As String)
      For Each ch As Char In text
         Console.Write("{0:X4} ", Convert.ToUInt16(ch))
      Next
   End Sub
End Module
' The example displays the following output:
'       Ⓢ ⁵ ∞
'       24C8 0020 2075 0020 221E
'
'       Encoded bytes: 3F 20 3F 20 3F
'
'       Round-trip: False
'       ? ? ?
'       003F 0020 003F 0020 003F
using System;
using System.Text;

/// <summary>
/// Encodes a string of three non-ASCII symbols with Encoding.ASCII, prints the
/// resulting bytes, decodes them again, and shows whether the string survived
/// the round trip.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        Encoding enc = Encoding.ASCII;

        string str1 = "\u24C8 \u2075 \u221E";
        Console.WriteLine(str1);
        WriteCodePoints(str1);
        Console.WriteLine("\n");

        // Encode the original string using the ASCII encoder.
        byte[] bytes = enc.GetBytes(str1);
        Console.Write("Encoded bytes: ");
        for (int position = 0; position < bytes.Length; position++)
        {
            Console.Write("{0:X2} ", bytes[position]);
        }
        Console.WriteLine("\n");

        // Decode the ASCII bytes and compare with the original.
        string str2 = enc.GetString(bytes);
        bool roundTripped = str1.Equals(str2);
        Console.WriteLine("Round-trip: {0}", roundTripped);
        if (roundTripped)
        {
            return;
        }

        Console.WriteLine(str2);
        WriteCodePoints(str2);
        Console.WriteLine();
    }

    /// <summary>Writes each UTF-16 code unit of <paramref name="text"/> as four hex digits followed by a space.</summary>
    private static void WriteCodePoints(string text)
    {
        foreach (char ch in text)
        {
            Console.Write("{0:X4} ", (ushort) ch);
        }
    }
}
// The example displays the following output:
//       Ⓢ ⁵ ∞
//       24C8 0020 2075 0020 221E
//
//       Encoded bytes: 3F 20 3F 20 3F
//
//       Round-trip: False
//       ? ? ?
//       003F 0020 003F 0020 003F
EncoderReplacementFallback 对象更改代码页 1252 编码器的行为 (*) 用作替换字符串。
Imports System.Text

''' <summary>
''' Shows a code page 1252 encoding configured with replacement fallbacks that
''' substitute an asterisk, by round-tripping a string of three symbols.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      ' Both the encoder and the decoder are given "*" as replacement string.
      Dim cp1252r As Encoding = Encoding.GetEncoding(
            1252,
            New EncoderReplacementFallback("*"),
            New DecoderReplacementFallback("*"))

      Dim str1 As String = String.Format("{0} {1} {2}", ChrW(&h24C8), ChrW(&h2075), ChrW(&h221E))
      Console.WriteLine(str1)
      Dim position As Integer = 0
      While position < str1.Length
         Console.Write("{0:X4} ", Convert.ToUInt16(str1(position)))
         position += 1
      End While
      Console.WriteLine()

      ' Encode, then decode what was encoded.
      Dim encoded() As Byte = cp1252r.GetBytes(str1)
      Dim str2 As String = cp1252r.GetString(encoded)
      Dim same As Boolean = str1.Equals(str2)
      Console.WriteLine("Round-trip: {0}", same)
      If Not same Then
         Console.WriteLine(str2)
         position = 0
         While position < str2.Length
            Console.Write("{0:X4} ", Convert.ToUInt16(str2(position)))
            position += 1
         End While
         Console.WriteLine()
      End If
   End Sub
End Module
' The example displays the following output:
'       Ⓢ ⁵ ∞
'       24C8 0020 2075 0020 221E
'       Round-trip: False
'       * * *
'       002A 0020 002A 0020 002A
using System;
using System.Text;

/// <summary>
/// Shows a code page 1252 encoding configured with replacement fallbacks that
/// substitute an asterisk, by round-tripping a string of three symbols.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        // Both the encoder and the decoder are given "*" as replacement string.
        Encoding cp1252r = Encoding.GetEncoding(
            1252,
            new EncoderReplacementFallback("*"),
            new DecoderReplacementFallback("*"));

        string str1 = "\u24C8 \u2075 \u221E";
        Console.WriteLine(str1);
        for (int position = 0; position < str1.Length; position++)
        {
            Console.Write("{0:X4} ", (ushort) str1[position]);
        }
        Console.WriteLine();

        // Encode, then decode what was encoded.
        byte[] encoded = cp1252r.GetBytes(str1);
        string str2 = cp1252r.GetString(encoded);
        bool same = str1.Equals(str2);
        Console.WriteLine("Round-trip: {0}", same);
        if (!same)
        {
            Console.WriteLine(str2);
            for (int position = 0; position < str2.Length; position++)
            {
                Console.Write("{0:X4} ", (ushort) str2[position]);
            }
            Console.WriteLine();
        }
    }
}
// The example displays the following output:
//       Ⓢ ⁵ ∞
//       24C8 0020 2075 0020 221E
//       Round-trip: False
//       * * *
//       002A 0020 002A 0020 002A
|
|
|---|
|
Implementing a Custom Fallback Strategy 。 |
但是,您可以自由选择任何替换字符串,因此,它可以包含多个字符。
异常回退
ASCIIEncoding 类的异常回退。
Imports System.Text

''' <summary>
''' Encodes a string of three non-ASCII symbols with a "us-ascii" encoding that
''' uses exception fallbacks, and reports the EncoderFallbackException or
''' DecoderFallbackException that results.
''' </summary>
Module Example
   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Public Sub Main()
      Dim enc As Encoding = Encoding.GetEncoding("us-ascii",
                                                 New EncoderExceptionFallback(),
                                                 New DecoderExceptionFallback())

      Dim str1 As String = String.Format("{0} {1} {2}", ChrW(&h24C8), ChrW(&h2075), ChrW(&h221E))
      Console.WriteLine(str1)
      For Each ch In str1
         Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"))
      Next
      Console.WriteLine()
      Console.WriteLine()

      ' Encode the original string using the ASCII encoder.
      Dim bytes() As Byte = {}
      Try
         bytes = enc.GetBytes(str1)
         Console.Write("Encoded bytes: ")
         For Each byt In bytes
            Console.Write("{0:X2} ", byt)
         Next
         Console.WriteLine()
      Catch e As EncoderFallbackException
         Console.Write("Exception: ")
         If e.IsUnknownSurrogate() Then
            ' Both halves of the pair are 16-bit values, so print four hex digits each.
            Console.WriteLine("Unable to encode surrogate pair 0x{0:X4} 0x{1:X4} at index {2}.",
                              Convert.ToUInt16(e.CharUnknownHigh),
                              Convert.ToUInt16(e.CharUnknownLow),
                              e.Index)
         Else
            Console.WriteLine("Unable to encode 0x{0:X4} at index {1}.",
                              Convert.ToUInt16(e.CharUnknown),
                              e.Index)
         End If
         ' Nothing was encoded, so there is nothing to decode.
         Exit Sub
      End Try
      Console.WriteLine()

      ' Decode the ASCII bytes.
      Try
         Dim str2 As String = enc.GetString(bytes)
         Console.WriteLine("Round-trip: {0}", str1.Equals(str2))
         If Not str1.Equals(str2) Then
            Console.WriteLine(str2)
            For Each ch In str2
               Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"))
            Next
            Console.WriteLine()
         End If
      Catch e As DecoderFallbackException
         Console.Write("Unable to decode byte(s) ")
         For Each unknown As Byte In e.BytesUnknown
            ' Pass the byte to the format string; without the argument the
            ' placeholder text itself would be printed.
            Console.Write("0x{0:X2} ", unknown)
         Next
         Console.WriteLine("at index {0}", e.Index)
      End Try
   End Sub
End Module
' The example displays the following output:
'       Ⓢ ⁵ ∞
'       24C8 0020 2075 0020 221E
'
'       Exception: Unable to encode 0x24C8 at index 0.
using System;
using System.Text;

/// <summary>
/// Encodes a string of three non-ASCII symbols with a "us-ascii" encoding that
/// uses exception fallbacks, and reports the EncoderFallbackException or
/// DecoderFallbackException that results.
/// </summary>
public class Example
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    public static void Main()
    {
        Encoding enc = Encoding.GetEncoding("us-ascii",
                                            new EncoderExceptionFallback(),
                                            new DecoderExceptionFallback());

        string str1 = "\u24C8 \u2075 \u221E";
        Console.WriteLine(str1);
        foreach (var ch in str1)
        {
            Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"));
        }
        Console.WriteLine("\n");

        // Encode the original string using the ASCII encoder.
        byte[] bytes = {};
        try
        {
            bytes = enc.GetBytes(str1);
            Console.Write("Encoded bytes: ");
            foreach (var byt in bytes)
            {
                Console.Write("{0:X2} ", byt);
            }
            Console.WriteLine();
        }
        catch (EncoderFallbackException e)
        {
            Console.Write("Exception: ");
            if (e.IsUnknownSurrogate())
            {
                // Both halves of the pair are 16-bit values, so print four hex digits each.
                Console.WriteLine("Unable to encode surrogate pair 0x{0:X4} 0x{1:X4} at index {2}.",
                                  Convert.ToUInt16(e.CharUnknownHigh),
                                  Convert.ToUInt16(e.CharUnknownLow),
                                  e.Index);
            }
            else
            {
                Console.WriteLine("Unable to encode 0x{0:X4} at index {1}.",
                                  Convert.ToUInt16(e.CharUnknown),
                                  e.Index);
            }
            // Nothing was encoded, so there is nothing to decode.
            return;
        }
        Console.WriteLine();

        // Decode the ASCII bytes.
        try
        {
            string str2 = enc.GetString(bytes);
            Console.WriteLine("Round-trip: {0}", str1.Equals(str2));
            if (!str1.Equals(str2))
            {
                Console.WriteLine(str2);
                foreach (var ch in str2)
                {
                    Console.Write("{0} ", Convert.ToUInt16(ch).ToString("X4"));
                }
                Console.WriteLine();
            }
        }
        catch (DecoderFallbackException e)
        {
            Console.Write("Unable to decode byte(s) ");
            foreach (byte unknown in e.BytesUnknown)
            {
                // Pass the byte to the format string; without the argument the
                // placeholder text itself would be printed.
                Console.Write("0x{0:X2} ", unknown);
            }
            Console.WriteLine("at index {0}", e.Index);
        }
    }
}
// The example displays the following output:
//       Ⓢ ⁵ ∞
//       24C8 0020 2075 0020 221E
//
//       Exception: Unable to encode 0x24C8 at index 0.
|
|
|---|
|
Implementing a Custom Fallback Strategy 。 |
DecoderFallbackException 对象提供有关导致异常的条件的信息:
-
.Index 属性指示第一个无法编码的字符在字符串中的位置。
-
.Index 属性指示未知字节的起始位置。
因此,它们不允许在编码或解码方法中替换或更正无效数据。
除了由代码页在内部实现的最佳映射之外,.NET Framework 还包括以下用于实现回退策略的类:
-
EncoderReplacementFallbackBuffer 来替换编码操作中的字符。
-
DecoderReplacementFallbackBuffer 来替换解码操作中的字符。
-
EncoderFallbackException 。
-
DecoderFallbackException 。
此外,您还可以按照以下步骤实现使用最佳回退、替换回退或异常回退的自定义解决方案,例如:
-
DecoderFallback 。
-
DecoderFallbackBuffer 。
-
ArgumentException。
从 EncoderFallback 或 DecoderFallback 派生
.GetEncoding(String, EncoderFallback, DecoderFallback) 方法并在编码类和回退实现之间的中间。
在创建编码器或解码器时自定义回退解决方案,必须实现以下成员:
-
对于自定义异常回退,其值为零。
-
方法由编码器或解码器调用:当编码器遇到第一个无法成功编码的字符时,或者当解码器遇到第一个无法成功解码的字节时。
从 EncoderFallbackBuffer 或 DecoderFallbackBuffer 派生
每个实例表示一个缓冲区,其中包含用于替换无法编码的字符或无法解码的字节序列的回退字符。
在创建编码器或解码器时自定义回退解决方案,必须实现以下成员:
-
对于异常回退,回退方法应引发异常。
-
在返回所有回退字符时,此方法应返回 U+0000。
-
.Remaining 属性,该属性返回回退缓冲区中的剩余字符数。
-
.MovePrevious 方法,将回退缓冲区中的当前位置移动到前一个字符。
-
.Reset 方法,重新初始化回退缓冲区。
DecoderFallbackBuffer 派生的类还维护两个私有实例字段:缓冲区中字符的确切数量,以及缓冲区中要返回的下一个字符的索引。
EncoderFallback 示例
下面的示例使用自定义最佳回退实现来提供非 ASCII 字符的更好映射。
MaxCharCount 属性返回 3。
''' <summary>
''' Encoder fallback that looks up unencodable characters in a table of
''' replacement values and otherwise falls back to <see cref="DefaultString"/>.
''' </summary>
Public Class CustomMapper : Inherits EncoderFallback
   ''' <summary>Replacement text used for characters that have no table entry.</summary>
   Public DefaultString As String

   ' Maps a UTF-16 code unit to its replacement characters, one per non-zero byte
   ' of the value (for example &H49004E0046 holds the bytes for "I", "N", "F").
   Friend mapping As Dictionary(Of UShort, ULong)

   ''' <summary>Creates a mapper whose default replacement string is "?".</summary>
   Public Sub New()
      Me.New("?")
   End Sub

   ''' <summary>Creates a mapper with the given default replacement string.</summary>
   ''' <param name="defaultString">Text to use for characters that are not in the table.</param>
   Public Sub New(ByVal defaultString As String)
      ' Build the table of mappings first, then record the default.
      Dim table As New Dictionary(Of UShort, ULong)
      table.Add(&H24C8, &H53)
      table.Add(&H2075, &H35)
      table.Add(&H221E, &H49004E0046)
      mapping = table

      Me.DefaultString = defaultString
   End Sub

   ''' <summary>Returns a new fallback buffer bound to this mapper.</summary>
   Public Overrides Function CreateFallbackBuffer() As System.Text.EncoderFallbackBuffer
      Dim buffer As New CustomMapperFallbackBuffer(Me)
      Return buffer
   End Function

   ''' <summary>The largest number of replacement characters produced for one input character.</summary>
   Public Overrides ReadOnly Property MaxCharCount As Integer
      Get
         Const longestReplacement As Integer = 3
         Return longestReplacement
      End Get
   End Property
End Class
/// <summary>
/// Encoder fallback that looks up unencodable characters in a table of
/// replacement values and otherwise falls back to <see cref="DefaultString"/>.
/// </summary>
public class CustomMapper : EncoderFallback
{
    /// <summary>Replacement text used for characters that have no table entry.</summary>
    public string DefaultString;

    // Maps a UTF-16 code unit to its replacement characters, one per non-zero byte
    // of the value (for example 0x49004E0046 holds the bytes for 'I', 'N', 'F').
    internal Dictionary<ushort, ulong> mapping;

    /// <summary>Creates a mapper whose default replacement string is "*".</summary>
    public CustomMapper() : this("*")
    {
    }

    /// <summary>Creates a mapper with the given default replacement string.</summary>
    /// <param name="defaultString">Text to use for characters that are not in the table.</param>
    public CustomMapper(string defaultString)
    {
        // Build the table of mappings first, then record the default.
        mapping = new Dictionary<ushort, ulong>
        {
            { 0x24C8, 0x53 },
            { 0x2075, 0x35 },
            { 0x221E, 0x49004E0046 }
        };
        DefaultString = defaultString;
    }

    /// <summary>Returns a new fallback buffer bound to this mapper.</summary>
    public override EncoderFallbackBuffer CreateFallbackBuffer()
    {
        CustomMapperFallbackBuffer buffer = new CustomMapperFallbackBuffer(this);
        return buffer;
    }

    /// <summary>The largest number of replacement characters produced for one input character.</summary>
    public override int MaxCharCount
    {
        get
        {
            const int longestReplacement = 3;
            return longestReplacement;
        }
    }
}
charsToReturn的位置,返回的下一个字符。
''' <summary>
''' Fallback buffer for <see cref="CustomMapper"/>. For each character the
''' encoder cannot encode, it supplies the replacement characters found in the
''' mapper's table, or the mapper's default string when there is no entry.
''' </summary>
Public Class CustomMapperFallbackBuffer : Inherits EncoderFallbackBuffer

   Dim count As Integer = -1        ' Number of characters left to return
   Dim index As Integer = -1        ' Index in charsToReturn of the character returned last
   Dim fb As CustomMapper
   Dim charsToReturn As String      ' Replacement characters, in the order they are returned

   ''' <summary>Creates a buffer that reads its mappings from <paramref name="fallback"/>.</summary>
   Public Sub New(ByVal fallback As CustomMapper)
      MyBase.New()
      Me.fb = fallback
   End Sub

   ''' <summary>Surrogate pairs are never mapped; always returns False.</summary>
   Public Overloads Overrides Function Fallback(ByVal charUnknownHigh As Char, ByVal charUnknownLow As Char, ByVal index As Integer) As Boolean
      ' Do not try to map surrogates to ASCII.
      Return False
   End Function

   ''' <summary>Prepares the replacement characters for <paramref name="charUnknown"/>.</summary>
   ''' <param name="charUnknown">The character that could not be encoded.</param>
   ''' <param name="index">Position of the character in the input (not used).</param>
   ''' <returns>
   ''' True when at least one replacement character is available; False when
   ''' earlier replacement characters are still pending or there is nothing to return.
   ''' </returns>
   Public Overloads Overrides Function Fallback(ByVal charUnknown As Char, ByVal index As Integer) As Boolean
      ' Return false if there are already characters to map.
      If count >= 1 Then Return False

      Dim packed As ULong
      If fb.mapping.TryGetValue(Convert.ToUInt16(charUnknown), packed) Then
         ' Unpack the non-zero bytes from most to least significant. Shifting
         ' (rather than BitConverter) keeps the order independent of the
         ' machine's byte order and yields the characters in output order.
         Dim unpacked As New System.Text.StringBuilder()
         For shift As Integer = 56 To 0 Step -8
            Dim byt As Byte = CByte((packed >> shift) And &HFFUL)
            If byt > 0 Then unpacked.Append(ChrW(byt))
         Next
         charsToReturn = unpacked.ToString()
      Else
         ' Return default.
         charsToReturn = If(fb.DefaultString, String.Empty)
      End If

      ' Every replacement character is pending, not just the first one.
      count = charsToReturn.Length
      Me.index = -1
      Return count > 0
   End Function

   ''' <summary>Returns the next replacement character, or U+0000 when none are left.</summary>
   Public Overrides Function GetNextChar() As Char
      ' Advance both counters together so that MovePrevious can undo exactly one call.
      count -= 1
      Me.index += 1
      ' If count is less than zero, we've returned all characters.
      If count < 0 Then Return ChrW(0)

      Return charsToReturn(Me.index)
   End Function

   ''' <summary>Steps back one position so the previous GetNextChar call is repeated.</summary>
   ''' <returns>True if the position was moved; otherwise False.</returns>
   Public Overrides Function MovePrevious() As Boolean
      ' Only back up when a character has been handed out, and move the index
      ' together with the count so the two cannot drift apart.
      If count >= -1 AndAlso index >= 0 Then
         index -= 1
         count += 1
         Return True
      End If
      Return False
   End Function

   ''' <summary>Number of replacement characters that remain to be returned.</summary>
   Public Overrides ReadOnly Property Remaining As Integer
      Get
         Return If(count < 0, 0, count)
      End Get
   End Property

   ''' <summary>Discards any pending replacement characters.</summary>
   Public Overrides Sub Reset()
      count = -1
      index = -1
   End Sub
End Class
/// <summary>
/// Fallback buffer for <see cref="CustomMapper"/>. For each character the
/// encoder cannot encode, it supplies the replacement characters found in the
/// mapper's table, or the mapper's default string when there is no entry.
/// </summary>
public class CustomMapperFallbackBuffer : EncoderFallbackBuffer
{
    int count = -1;                   // Number of characters left to return
    int index = -1;                   // Index in charsToReturn of the character returned last
    CustomMapper fb;
    string charsToReturn;             // Replacement characters, in the order they are returned

    /// <summary>Creates a buffer that reads its mappings from <paramref name="fallback"/>.</summary>
    public CustomMapperFallbackBuffer(CustomMapper fallback)
    {
        this.fb = fallback;
    }

    /// <summary>Surrogate pairs are never mapped; always returns false.</summary>
    public override bool Fallback(char charUnknownHigh, char charUnknownLow, int index)
    {
        // Do not try to map surrogates to ASCII.
        return false;
    }

    /// <summary>Prepares the replacement characters for <paramref name="charUnknown"/>.</summary>
    /// <param name="charUnknown">The character that could not be encoded.</param>
    /// <param name="index">Position of the character in the input (not used).</param>
    /// <returns>
    /// true when at least one replacement character is available; false when
    /// earlier replacement characters are still pending or there is nothing to return.
    /// </returns>
    public override bool Fallback(char charUnknown, int index)
    {
        // Return false if there are already characters to map.
        if (count >= 1)
        {
            return false;
        }

        ulong packed;
        if (fb.mapping.TryGetValue(Convert.ToUInt16(charUnknown), out packed))
        {
            // Unpack the non-zero bytes from most to least significant. Shifting
            // (rather than BitConverter) keeps the order independent of the
            // machine's byte order and yields the characters in output order.
            System.Text.StringBuilder unpacked = new System.Text.StringBuilder();
            for (int shift = 56; shift >= 0; shift -= 8)
            {
                byte byt = (byte) ((packed >> shift) & 0xFF);
                if (byt > 0)
                {
                    unpacked.Append((char) byt);
                }
            }
            charsToReturn = unpacked.ToString();
        }
        else
        {
            // Return default.
            charsToReturn = fb.DefaultString ?? String.Empty;
        }

        // Every replacement character is pending, not just the first one.
        count = charsToReturn.Length;
        this.index = -1;
        return count > 0;
    }

    /// <summary>Returns the next replacement character, or U+0000 when none are left.</summary>
    public override char GetNextChar()
    {
        // Advance both counters together so that MovePrevious can undo exactly one call.
        count--;
        this.index++;
        // If count is less than zero, we've returned all characters.
        if (count < 0)
        {
            return '\u0000';
        }

        return charsToReturn[this.index];
    }

    /// <summary>Steps back one position so the previous GetNextChar call is repeated.</summary>
    /// <returns>true if the position was moved; otherwise false.</returns>
    public override bool MovePrevious()
    {
        // Only back up when a character has been handed out, and move the index
        // together with the count so the two cannot drift apart.
        if (count >= -1 && index >= 0)
        {
            index--;
            count++;
            return true;
        }
        return false;
    }

    /// <summary>Number of replacement characters that remain to be returned.</summary>
    public override int Remaining
    {
        get { return count < 0 ? 0 : count; }
    }

    /// <summary>Discards any pending replacement characters.</summary>
    public override void Reset()
    {
        count = -1;
        index = -1;
    }
}
该输出指示最佳回退实现成功处理在原始字符串的三个非 ASCII 字符。
Imports System.Text
Imports System.Collections.Generic

''' <summary>
''' Round-trips a string of three non-ASCII symbols through a "us-ascii"
''' encoding that uses CustomMapper as its encoder fallback and an exception
''' fallback for decoding.
''' </summary>
Module Module1

   ''' <summary>Runs the sample and writes its results to the console.</summary>
   Sub Main()
      Dim enc As Encoding = Encoding.GetEncoding("us-ascii",
                                                 New CustomMapper(),
                                                 New DecoderExceptionFallback())

      Dim str1 As String = String.Format("{0} {1} {2}", ChrW(&H24C8), ChrW(&H2075), ChrW(&H221E))
      Console.WriteLine(str1)

      ' Print the code units on one line; the line is ended after the last one.
      Dim lastPosition As Integer = str1.Length - 1
      Dim position As Integer = 0
      While position <= lastPosition
         Console.Write("{0:X4} ", Convert.ToUInt16(str1(position)))
         If position = lastPosition Then
            Console.WriteLine()
         End If
         position += 1
      End While
      Console.WriteLine()

      ' Encode the original string using the ASCII encoder.
      Dim bytes() As Byte = enc.GetBytes(str1)
      Console.Write("Encoded bytes: ")
      For byteIndex As Integer = 0 To bytes.Length - 1
         Console.Write("{0:X2} ", bytes(byteIndex))
      Next
      Console.WriteLine()
      Console.WriteLine()

      ' Decode the ASCII bytes and compare with the original.
      Dim str2 As String = enc.GetString(bytes)
      Dim roundTripped As Boolean = str1.Equals(str2)
      Console.WriteLine("Round-trip: {0}", roundTripped)
      If roundTripped Then Return

      Console.WriteLine(str2)
      For Each ch As Char In str2
         Console.Write("{0:X4} ", Convert.ToUInt16(ch))
      Next
      Console.WriteLine()
   End Sub
End Module
using System;
using System.Collections.Generic;
using System.Text;

/// <summary>
/// Round-trips a string of three non-ASCII symbols through a "us-ascii"
/// encoding that uses CustomMapper as its encoder fallback and an exception
/// fallback for decoding.
/// </summary>
class Program
{
    /// <summary>Runs the sample and writes its results to the console.</summary>
    static void Main()
    {
        Encoding enc = Encoding.GetEncoding("us-ascii",
                                            new CustomMapper(),
                                            new DecoderExceptionFallback());

        string str1 = "\u24C8 \u2075 \u221E";
        Console.WriteLine(str1);

        // Print the code units on one line; the line is ended after the last one.
        int lastPosition = str1.Length - 1;
        int position = 0;
        while (position <= lastPosition)
        {
            Console.Write("{0:X4} ", (ushort) str1[position]);
            if (position == lastPosition)
            {
                Console.WriteLine();
            }
            position++;
        }
        Console.WriteLine();

        // Encode the original string using the ASCII encoder.
        byte[] bytes = enc.GetBytes(str1);
        Console.Write("Encoded bytes: ");
        for (int byteIndex = 0; byteIndex < bytes.Length; byteIndex++)
        {
            Console.Write("{0:X2} ", bytes[byteIndex]);
        }
        Console.WriteLine("\n");

        // Decode the ASCII bytes and compare with the original.
        string str2 = enc.GetString(bytes);
        bool roundTripped = str1.Equals(str2);
        Console.WriteLine("Round-trip: {0}", roundTripped);
        if (roundTripped)
        {
            return;
        }

        Console.WriteLine(str2);
        foreach (char ch in str2)
        {
            Console.Write("{0:X4} ", (ushort) ch);
        }
        Console.WriteLine();
    }
}