【发布时间】:2023-03-18 02:41:01
【问题描述】:
在 java 中,不使用 Integer.parseInt 将子字符串转换为整数的最快方法是什么?我想知道是否有办法避免使用 parseInt,因为它需要我创建一个临时字符串,该字符串是我要转换的子字符串的副本。
"abcd12345abcd"我想通过不使用子字符串来避免创建新的临时字符串。
如果我自己动手,有没有办法避免我在String.charAt(int) 中看到的数组边界检查的开销?
编辑
我从每个人那里得到了很多很好的信息......以及关于预优化的常见警告:) 基本答案是没有什么比 String.charAt 或 char[] 更好的了。不安全的代码即将被淘汰(也许)。编译器很可能可以优化掉对 [] 的过度范围检查。
我做了一些基准测试,由于不使用子字符串和滚动特定的 parseInt 而节省的费用是巨大的。
调用 Integer.parseInt(str.substring(4,8)) 的成本的 32% 来自子字符串。这不包括后续的垃圾收集成本。
Integer.parseInt 旨在处理非常广泛的输入。通过使用 charAt 滚动我自己的 parseInt(特定于我们的数据的样子),我能够在 substring 方法上实现 6 倍的加速。
尝试 char[] 的评论导致性能提升约 7 倍。但是,您的数据必须已经在 char[] 中,因为转换为 char 数组的成本很高。对于解析文本,完全留在 char[] 中并编写一些函数来比较字符串似乎是有意义的。
基准测试结果(越小越快):
parseInt(substring) 23731665
parseInt(string) 16859226
Atoi1 7116633
Atoi2 4514031
Atoi3 char[] 4135355
Atoi4 char[] 3503638
Atoi5 char[] 5485495
GetNumber1 8666020
GetNumber2 5951939
在基准测试期间,我还试验了 Inline 的开启和关闭,并验证编译器正确地内联了所有内容。
如果有人关心,这是我的基准测试代码...
package javaatoi;
import java.lang.management.GarbageCollectorMXBean;
import java.lang.management.ManagementFactory;
public class JavaAtoi {
static int cPasses = 10;
static int cTests = 9;
static int cIter = 0x100000;
static int cString = 0x100;
static int fStringMask = cString - 1;
public static void main(String[] args) throws InterruptedException {
// setup test data. Use a large enough set that the compiler
// wont unroll the loop. Use a small enough set that we are
// keeping the data in L2. I don't want to measure memory loads.
String[] a = new String[cString];
for (int i = 0 ; i< cString ; i+=4) {
// leading zeros will occur, so add one number with one.
a[i+0] = "abcd01234abcd";
a[i+1] = "abcd1234abcd";
a[i+2] = "abcd1234abcd";
a[i+3] = "abcd1234abcd";
}
// array of pre-substringed stuff
String[] a1 = new String[cString];
for (int i=0 ; i< cString ; ++i)
a1[i]= a[i].substring(4,8);
// char array version of the strings
char[][] b = new char[cString][];
for (int i =0 ; i<cString ; ++i)
b[i] = a[i].toCharArray();
// array to hold times for each test for each pass
long[][] t = new long[cPasses][cTests];
// multiple dry runs to let the compiler optimize the functions
for (int i=0 ; i<50 ; ++i) {
t[0][0] = TestParseInt1(a)[0];
t[0][1] = TestParseInt2(a1)[0];
t[0][2] = TestAtoi1(a)[0];
t[0][3] = TestAtoi2(a)[0];
t[0][4] = TestAtoi3(b)[0];
t[0][5] = TestAtoi4(b)[0];
t[0][6] = TestAtoi5(b)[0];
t[0][7] = TestAtoi6(a)[0];
t[0][8] = TestAtoi7(a)[0];
}
// now do a bunch of tests
for (int i=0 ; i<cPasses ; ++i) {
t[i][0] = TestParseInt1(a)[0];
t[i][1] = TestParseInt2(a1)[0];
t[i][2] = TestAtoi1(a)[0];
t[i][3] = TestAtoi2(a)[0];
t[i][4] = TestAtoi3(b)[0];
t[i][5] = TestAtoi4(b)[0];
t[i][6] = TestAtoi5(b)[0];
t[i][7] = TestAtoi6(a)[0];
t[i][8] = TestAtoi7(a)[0];
}
// setup mins - we only care about min time.
t[cPasses-1] = new long[cTests];
for (int i=0 ; i<cTests ; ++i)
t[cPasses-1][i] = 999999999;
for (int j=0 ; j<cTests ; ++j) {
for (int i=0 ; i<cPasses-1 ; ++i) {
long n = t[i][j];
if (n < t[cPasses-1][j])
t[cPasses-1][j] = n;
}
}
// output string
String s = new String();
for (int j=0 ; j<cTests ; ++j) {
for (int i=0 ; i<cPasses ; ++i) {
long n = t[i][j];
s += String.format("%9d", n);
}
s += "\n";
}
System.out.println(s);
// if you comment out the part of TestParseInt1 you can sorta see the
// gc cost.
System.gc(); // Trying to get an idea of the total substring cost
Thread.sleep(1000); // i dunno if this matters. Seems like the gc takes a little while. Not real exact...
long collectionTime = 0;
for (GarbageCollectorMXBean garbageCollectorMXBean : ManagementFactory.getGarbageCollectorMXBeans()) {
long n = garbageCollectorMXBean.getCollectionTime();
if (n > 0)
collectionTime += n;
}
System.out.println(collectionTime*1000000);
}
// you have to put each test function in its own wrapper to
// get the compiler to fairly optimize each test.
// I also made sure I incremented n and used a large # of string
// to make it harder for the compiler to eliminate the loops.
static long[] TestParseInt1(String[] a) {
long n = 0;
long startTime = System.nanoTime();
// comment this out to get an idea of gc cost without the substrings
// then uncomment to get idea of gc cost with substrings
for (int i=0 ; i<cIter ; ++i)
n += Integer.parseInt(a[i&fStringMask].substring(4,8));
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestParseInt2(String[] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Integer.parseInt(a[i&fStringMask]);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi1(String[] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi1(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi2(String[] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi2(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi3(char[][] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi3(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi4(char[][] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi4(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi5(char[][] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi5(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi6(String[] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi6(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static long[] TestAtoi7(String[] a) {
long n = 0;
long startTime = System.nanoTime();
for (int i=0 ; i<cIter ; ++i)
n += Atoi7(a[i&fStringMask], 4, 4);
return new long[] { System.nanoTime() - startTime, n };
}
static int Atoi1(String s, int i0, int cb) {
int n = 0;
boolean fNeg = false; // for unsigned T, this assignment is removed by the optimizer
int i = i0;
int i1 = i + cb;
int ch;
// skip leading crap, scan for -
for ( ; i<i1 && ((ch = s.charAt(i)) > '9' || ch <= '0') ; ++i) {
if (ch == '-')
fNeg = !fNeg;
}
// here is the loop to process the valid number chars.
for ( ; i<i1 ; ++i)
n = n*10 + (s.charAt(i) - '0');
return (fNeg) ? -n : n;
}
static int Atoi2(String s, int i0, int cb) {
int n = 0;
for (int i=i0 ; i<i0+cb ; ++i) {
char ch = s.charAt(i);
n = n*10 + ((ch <= '0') ? 0 : ch - '0');
}
return n;
}
static int Atoi3(char[] s, int i0, int cb) {
int n = 0, i = i0, i1 = i + cb;
// skip leading spaces or zeros
for ( ; i<i1 && s[i] <= '0' ; ++i) { }
// loop to process the valid number chars.
for ( ; i<i1 ; ++i)
n = n*10 + (s[i] - '0');
return n;
}
static int Atoi4(char[] s, int i0, int cb) {
int n = 0;
// loop to process the valid number chars.
for (int i=i0 ; i<i0+cb ; ++i) {
char ch = s[i];
n = n*10 + ((ch <= '0') ? 0 : ch - '0');
}
return n;
}
static int Atoi5(char[] s, int i0, int cb) {
int ch, n = 0, i = i0, i1 = i + cb;
// skip leading crap or zeros
for ( ; i<i1 && ((ch = s[i]) <= '0' || ch > '9') ; ++i) { }
// loop to process the valid number chars.
for ( ; i<i1 && (ch = s[i] - '0') >= 0 && ch <= 9 ; ++i)
n = n*10 + ch;
return n;
}
static int Atoi6(String data, int start, int length) {
int number = 0;
for (int i = start; i <= start + length; i++) {
if (Character.isDigit(data.charAt(i))) {
number = (number * 10) + (data.charAt(i) - 48);
}
}
return number;
}
static int Atoi7(String data, int start, int length) {
int number = 0;
for (int i = start; i <= start + length; i++) {
char ch = data.charAt(i);
if (ch >= '0' && ch <= '9') {
number = (number * 10) + (ch - 48);
}
}
return number;
}
}
【问题讨论】:
-
那么如果字符串是“ABC123DEF456”,那么结果整数是123还是123456?
-
@BrandonLing:不完全;你必须先去掉非数字字符。
-
数字部分是否总是从索引4开始?
-
子字符串总是数字。我们在编译时也知道字符串中的位置和长度。
-
字符串是不可变的;对 String 进行的任何操作都会创建一个新操作。除非您真的想处理数组,否则创建新字符串的开销是如此微不足道,在我们知道这是一个大问题之前,没有理由尝试围绕它编写代码。
标签: java string integer type-conversion