感谢 @Emil_Wozniak 发布完整代码。我为此折腾了一段时间,才意识到 eliminateOutliers() 实际上返回的是异常值本身,而不是剔除异常值之后的列表。isOutOfBounds() 方法同样令人困惑,因为它实际上在值位于边界之内时返回 true。以下是我做了一些(恕我直言的)改进之后的更新版本:
- eliminateOutliers() 方法返回已删除异常值的输入列表
- 添加了 getOutliers() 方法来获取异常值列表
- 删除了令人困惑的 isOutOfBounds() 方法,改用简单的过滤表达式
- 扩展了临界值表(按样本数 N 索引),最多支持 30 个输入值
- 防止输入列表太大或太小时出现越界错误
- 将统计方法(mean、stddev、variance)改为静态工具方法
- 只计算一次上限/下限,而不是每次比较时计算
- 在构造函数中传入输入列表,并将其保存为实例变量
- 重构代码,避免实例变量与局部变量同名
代码:
/**
* Implements an outlier removal algorithm based on https://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/dixon.htm#:~:text=It%20can%20be%20used%20to,but%20one%20or%20two%20observations).
* Original Java code by Emil Wozniak at https://stackoverflow.com/questions/18805178/how-to-detect-outliers-in-an-arraylist
*
* Reorganized, made more robust, and clarified many of the methods.
*/
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Separates a list of values into retained values and outliers.
 *
 * <p>A value is retained when it lies within
 * {@code mean ± stdDev * criticalValue} (bounds inclusive). The mean and the
 * sample standard deviation are computed once, over the full input (outliers
 * included), and the critical value is picked from a Dixon Q-test table by
 * input size.
 *
 * <p>NOTE(review): the table value is used here as a multiplier on the
 * standard deviation rather than as a threshold for a gap/range ratio;
 * confirm this is the intended use of the table.
 */
public class DixonTest {

    /**
     * Critical values indexed by {@code N - 3}, i.e. index 0 is N=3 and the
     * last entry is N=30.
     */
    protected final List<Double> criticalValues =
            List.of( // Taken from https://sebastianraschka.com/Articles/2014_dixon_test.html#2-calculate-q
                    // Alpha level of 0.1 (90% confidence)
                    0.941, // N=3
                    0.765, // N=4
                    0.642, // ...
                    0.56,
                    0.507,
                    0.468,
                    0.437,
                    0.412,
                    0.392,
                    0.376,
                    0.361,
                    0.349,
                    0.338,
                    0.329,
                    0.32,
                    0.313,
                    0.306,
                    0.3,
                    0.295,
                    0.29,
                    0.285,
                    0.281,
                    0.277,
                    0.273,
                    0.269,
                    0.266,
                    0.263,
                    0.26 // N=30
            );

    // Stats calculated on original input data (including outliers)
    private final double scaleOfElimination;
    private final double mean;
    private final double stdDev;
    private final double upperBound;
    private final double lowerBound;
    private final List<Double> input;

    /**
     * Creates a detector for the given values and computes the statistics and
     * the inclusive lower/upper bounds once, up front.
     *
     * <p>The list is copied, so later changes to the caller's list do not
     * affect this instance.
     *
     * @param input values to be analyzed; must not be {@code null} and must
     *              not contain {@code null} elements
     * @throws NullPointerException if {@code input} or one of its elements is
     *                              {@code null}
     */
    public DixonTest(List<Double> input) {
        // Snapshot: the bounds below are only valid for exactly these values.
        this.input = new ArrayList<>(input);

        // Clamp the table index so lists with fewer than 3 or more than 30
        // values use the first or last table entry instead of failing.
        int tableIndex = Math.min(
                Math.max(0, this.input.size() - 3),
                criticalValues.size() - 1);
        // Read as double; narrowing to float would discard precision.
        scaleOfElimination = criticalValues.get(tableIndex);

        mean = getMean(this.input);
        // The sample standard deviation is NaN for a single value, which
        // would make every bounds comparison false. Use 0 spread instead so a
        // lone value is retained rather than silently dropped.
        stdDev = this.input.size() < 2 ? 0.0 : getStdDev(this.input);

        double halfWidth = stdDev * scaleOfElimination;
        upperBound = mean + halfWidth;
        lowerBound = mean - halfWidth;
    }

    /**
     * Returns the arithmetic mean of a list of values.
     *
     * @param valueList values to average
     * @return the sum divided by the number of values; {@code NaN} for an
     *         empty list
     */
    public static double getMean(final List<Double> valueList) {
        double sum = valueList.stream()
                .mapToDouble(Double::doubleValue)
                .sum();
        return sum / valueList.size();
    }

    /**
     * Returns the sample variance (divisor {@code n - 1}) of a list of values.
     *
     * <p>The result is only meaningful for lists with at least two values; a
     * single-value list yields {@code NaN}.
     *
     * @param valueList values to analyze
     * @return the sum of squared deviations from the mean divided by
     *         {@code valueList.size() - 1}
     */
    public static double getVariance(List<Double> valueList) {
        double listMean = getMean(valueList);
        double squaredDeviations = valueList.stream()
                .mapToDouble(Double::doubleValue)
                .map(value -> (value - listMean) * (value - listMean))
                .sum();
        return squaredDeviations / (valueList.size() - 1);
    }

    /**
     * Returns the sample standard deviation of a list of values, i.e. the
     * square root of {@link #getVariance(List)}.
     *
     * @param valueList values to analyze
     * @return the square root of the sample variance
     */
    public static double getStdDev(List<Double> valueList) {
        return Math.sqrt(getVariance(valueList));
    }

    /**
     * Tells whether a value lies inside the inclusive bounds computed at
     * construction time. Any comparison involving {@code NaN} is false, so
     * {@code NaN} values or bounds are reported as "not within bounds".
     */
    private boolean isWithinBounds(double value) {
        return value >= lowerBound && value <= upperBound;
    }

    /**
     * Returns the input values with outliers removed, in input order.
     *
     * @return a new list holding every input value that lies within the
     *         inclusive bounds
     */
    public List<Double> eliminateOutliers() {
        return input.stream()
                .filter(this::isWithinBounds)
                .collect(Collectors.toList());
    }

    /**
     * Returns the outliers found in the input list, in input order.
     *
     * <p>This is the exact complement of {@link #eliminateOutliers()}: every
     * input value appears in exactly one of the two lists.
     *
     * @return a new list holding every input value that does not lie within
     *         the inclusive bounds
     */
    public List<Double> getOutliers() {
        return input.stream()
                .filter(value -> !isWithinBounds(value))
                .collect(Collectors.toList());
    }

    /**
     * Test and sample usage.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Double> testValues = List.of(1200.0, 1205.0, 1220.0, 1194.0, 1212.0);
        DixonTest outlierDetector = new DixonTest(testValues);
        List<Double> goodValues = outlierDetector.eliminateOutliers();
        List<Double> badValues = outlierDetector.getOutliers();
        System.out.println(goodValues.size() + " good values:");
        for (double v : goodValues) {
            System.out.println(v);
        }
        System.out.println(badValues.size() + " outliers detected:");
        for (double v : badValues) {
            System.out.println(v);
        }
        // Get stats on remaining (good) values
        System.out.println("\nMean of good values is " + DixonTest.getMean(goodValues));
    }
}