正如ironstone13 纠正我的那样,HashSet 没问题,但确实存储了数据。
那么这也可以正常工作:
string[] arr = File.ReadAllLines("file.txt");
HashSet<string> hashes = new HashSet<string>();
for (int i = 0; i < arr.Length; i++)
{
if (!hashes.Add(arr[i])) arr[i] = null;
}
File.WriteAllLines("file2.txt", arr.Where(x => x != null));
此实现的动机是内存性能和哈希冲突。
主要思想是只保留散列,当然它必须返回文件以获取它认为是散列冲突/重复的行,以检测它是哪一个。 (那部分没有实现)。
class Program
{
static string[] arr;
static Dictionary<int, int>[] hashes = new Dictionary<int, int>[1]
{ new Dictionary<int, int>() }
;
static int[] file_indexes = {-1};
static void AddHash(int hash, int index)
{
for (int h = 0; h < hashes.Length; h++)
{
Dictionary<int, int> dict = hashes[h];
if (!dict.ContainsKey(hash))
{
dict[hash] = index;
return;
}
}
hashes = hashes.Union(new[] {new Dictionary<int, int>() {{hash, index}}}).ToArray();
file_indexes = Enumerable.Range(0, hashes.Length).Select(x => -1).ToArray();
}
static int UpdateFileIndexes(int hash)
{
int updates = 0;
for (int h = 0; h < hashes.Length; h++)
{
int index;
if (hashes[h].TryGetValue(hash, out index))
{
file_indexes[h] = index;
updates++;
}
else
{
file_indexes[h] = -1;
}
}
return updates;
}
static bool IsDuplicate(int index)
{
string str1 = arr[index];
for (int h = 0; h < hashes.Length; h++)
{
int i = file_indexes[h];
if (i == -1 || index == i) continue;
string str0 = arr[i];
if (str0 == null) continue;
if (string.CompareOrdinal(str0, str1) == 0) return true;
}
return false;
}
static void Main(string[] args)
{
arr = File.ReadAllLines("file.txt");
for (int i = 0; i < arr.Length; i++)
{
int hash = arr[i].GetHashCode();
if (UpdateFileIndexes(hash) == 0) AddHash(hash, i);
else if (IsDuplicate(i)) arr[i] = null;
else AddHash(hash, i);
}
File.WriteAllLines("file2.txt", arr.Where(x => x != null));
Console.WriteLine("DONE");
Console.ReadKey();
}
}