【问题标题】:java does not convert from latin to utf8 properly(java 不能正确地从 Latin 编码转换为 utf8)
【发布时间】:2012-12-27 11:56:27
【问题描述】:

我从mysql中选择数据,数据库不是utf8(unicode字符另存为latin,例如unicode字符串Đỗ Tiến(正确形式)另存为Äá»— Tiến)。如果我使用PHP回显到html,我只需设置<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />网页显示正确。 如果我没有设置meta标签,Chrome打开时,Chrome检测到windows-1258编码,手动改成Unicode(utf-8),网页正常显示。

问题是:当我使用 jdbc 从 mysql 中选择数据时,我会这样转换:

    // Encode the mis-decoded text back into bytes using two candidate single-byte charsets.
    byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
    byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
    // Reinterpret each byte sequence as UTF-8.
    String unicode1 = new String(asciiBytes1, "UTF-8");
    String unicode2 = new String(asciiBytes2, "UTF-8");
    // The trailing comments record the output reported by the asker; neither attempt
    // yields the expected text, which is the problem being asked about.
    System.out.println(unicode1);//�?ỗ tiến
    System.out.println(unicode2);//Đ�? tiến

结果,java 没有正确转换,我在http://docs.oracle.com/javase/1.4.2/docs/guide/intl/encoding.doc.html 中尝试了许多编码,不仅是 Cp1258 和 ISO-8859-1,但没有一个有效。 2个简单的转换方法是使用我之前提到的带有Äá»— tiến字符串的html文件或使用notepad ++,设置编码ANSI,粘贴Äá»— tiến字符串然后更改为utf-8,它将显示Đỗ Tiến(是正确的字符串我要)

【问题讨论】:

    标签: java mysql utf-8 character-encoding codepages


    【解决方案1】:

    这有点复杂:数据使用的是修改版的 Windows-1252,其中通常未分配的 0x81、0x8d、0x8f、0x90 和 0x9d 被替换为对应的 C1 控制字符。默认情况下,Java 在使用 Windows-1252 时似乎没有考虑到这一点。

    最简单的方法是修复您的数据库并在任何地方使用 UTF-8。

    这是代码

    /**
     * Code points that the "modified" Windows-1252 table assigns to bytes 0x80-0x9F
     * (index 0 corresponds to byte 0x80). The five positions that standard
     * Windows-1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) hold the C1
     * control character with the same numeric value.
     */
    private static final char[] MODIFIED_W1252_ENCODE_HIGH = {
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
    };

    /**
     * Encodes a string into bytes using the modified Windows-1252 table.
     *
     * <p>Every UTF-16 code unit of {@code str} produces exactly one output byte.
     * Code units that have no entry in the table are written as {@code '?'} (0x3F).
     *
     * @param str the text to encode; must not be {@code null}
     * @return a new array of {@code str.length()} bytes
     */
    public static byte[] getBytesModifiedW1252( String str ) {
        final byte replacement = (byte) 0x3F;
        final byte[] ret = new byte[str.length()];

        for (int i = 0; i < ret.length; ++i) {
            final char c = str.charAt(i);

            if (c < 0x80 || (c >= 0xA0 && c <= 0xFF)) {
                // These two ranges map to the byte with the same numeric value.
                ret[i] = (byte) c;
                continue;
            }

            // Everything else is either one of the 32 entries of the 0x80-0x9F
            // row or unmappable; the table is small enough for a linear scan.
            byte encoded = replacement;
            for (int j = 0; j < MODIFIED_W1252_ENCODE_HIGH.length; ++j) {
                if (MODIFIED_W1252_ENCODE_HIGH[j] == c) {
                    encoded = (byte) (0x80 + j);
                    break;
                }
            }
            ret[i] = encoded;
        }

        return ret;
    }
    
    /**
     * Demo entry point: encodes a mis-decoded sample string with
     * {@code getBytesModifiedW1252}, reinterprets the bytes as UTF-8 and prints the result.
     *
     * @param args unused
     * @throws UnsupportedEncodingException declared because the charset is looked up by name
     */
    public static void main(String args[]) throws UnsupportedEncodingException {
        final String garbled = "Äá»— tiến";
        final String repaired = new String(getBytesModifiedW1252( garbled ), "UTF-8");
        System.out.println(repaired);
        //Đỗ tiến
    }
    

    这是相反的:

    /**
     * Characters that the "modified" Windows-1252 table assigns to bytes 0x80-0x9F
     * (index 0 corresponds to byte 0x80). The five positions that standard
     * Windows-1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) decode to the
     * C1 control character with the same numeric value.
     */
    private static final char[] MODIFIED_W1252_DECODE_HIGH = {
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
    };

    /**
     * Decodes bytes into a string using the modified Windows-1252 table.
     *
     * <p>Each input byte produces exactly one character, so every byte value is
     * decodable and the result has {@code bytes.length} characters.
     *
     * @param bytes the bytes to decode; must not be {@code null}
     * @return the decoded text
     */
    public static String getStringModifiedW1252( byte[] bytes ) {
        final StringBuilder ret = new StringBuilder(bytes.length);

        for (int i = 0; i < bytes.length; ++i) {
            // Java bytes are signed; mask to get the 0-255 table position.
            final int unsigned = bytes[i] & 0xFF;

            if (unsigned >= 0x80 && unsigned < 0xA0) {
                ret.append(MODIFIED_W1252_DECODE_HIGH[unsigned - 0x80]);
            } else {
                // Outside the 0x80-0x9F row the table is the identity mapping.
                ret.append((char) unsigned);
            }
        }

        return ret.toString();
    }
    
    /**
     * Demo entry point: takes the UTF-8 bytes of a sample string, decodes them with
     * {@code getStringModifiedW1252} and prints the resulting text.
     *
     * @param args unused
     * @throws UnsupportedEncodingException declared because the charset is looked up by name
     */
    public static void main(String args[]) throws UnsupportedEncodingException {
        final byte[] utf8Bytes = "Đỗ tiến".getBytes("UTF-8");
        final String decoded = getStringModifiedW1252( utf8Bytes );
        System.out.println(decoded);
        //Äá»— tiến
    }
    

    您可能希望将映射(Map)和数组保存在某处,而不是在每次调用方法时重新创建它们

    【讨论】:

    • 我无法编辑数据库,因为它不在我的许可范围内,但是有什么方法不使用 java-processing,例如通过 sql 命令转换?
    • @yelliver 好吧,您可以在 JDBC 连接字符串中尝试"jdbc:mysql://hostName/dbName?characterEncoding=UTF-8"
    • 它确实有效,因为我的数据库不是 utf8 格式(我注意到了)我现在对您的解决方案感到满意您能否提供我在 sql 中查询 LIKE 的反向解决方案(从 utf8 字符串转换修改 windows1252 字符串)?
    • @yelliver 你的意思是如何将"Đỗ tiến" 转换为"Äá»— tiến"
    • @yelliver 好了我已经添加了逆向操作的代码,希望对你有帮助
    【解决方案2】:

    试试这个

    // Encode the text into bytes with each candidate charset.
    byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
    byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
    // Decode each byte array with the same charset that produced it (not UTF-8).
    String unicode1 = new String(asciiBytes1, "Cp1258");
    String unicode2 = new String(asciiBytes2, "ISO-8859-1");
    // NOTE(review): the trailing output comments are identical to those in the
    // question's snippet and were presumably copied unchanged; confirm the real output.
    System.out.println(unicode1);//�?ỗ tiến
    System.out.println(unicode2);//Đ�? tiến
    

    【讨论】:

      猜你喜欢
      • 2011-02-28
      • 2010-11-28
      • 2012-03-12
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2015-10-23
      • 2016-02-17
      • 2011-07-27
      相关资源
      最近更新 更多