下面的思路与 @glennjackman 提出的解决方案类似:
- 找到公共前缀
- 找到公共后缀
- 去掉公共前缀和公共后缀,剩下的部分就是差异
假设:
- 文件名列表以逗号分隔的字符串形式提供
- 可变数量的文件名
- 逐个字符比较
- 比较时不按文件名内部的分隔符(例如 _)切分字段
- 假定“差异”是由连续字符组成的单个片段,例如,在比较
aBcDe 和 aXcYe 时,我们不把 c 视为公共部分,因此差异将报告为 BcD 和 XcY
下面是一个使用 awk 的方案,性能应当优于在 bash 层面逐字符循环:
awk '
# Print, for each name in a comma-separated list, the part that remains after
# removing the prefix and the suffix shared by every name in the list.
# Only the first input record is read; one result line is printed per name.

# Return 1 if every name has the same character k positions in from the
# start (from_end == 0) or from the end (from_end == 1), otherwise 0.
# k is 1-based.  Names come from the global array fname[1..n]; the
# parameters after the extra spaces are awk-style local variables.
function same_char(k, from_end,    i, pos, ch, first) {
    for (i = 1; i <= n; i++) {
        pos = from_end ? length(fname[i]) - k + 1 : k
        ch = substr(fname[i], pos, 1)
        if (i == 1)
            first = ch
        else if (ch != first)
            return 0
    }
    return 1
}

# for now assume the names arrive via a here-string, delimited by commas
FNR == 1 {
    n = split($0, fname, ",")
    exit                        # skip to END processing
}

END {
    if (n < 1)
        exit                    # nothing to compare

    # Neither the prefix nor the suffix can be longer than the shortest name.
    minlen = length(fname[1])
    for (i = 2; i <= n; i++)
        if (length(fname[i]) < minlen)
            minlen = length(fname[i])

    # pfx = number of leading characters common to all names
    pfx = 0
    while (pfx < minlen && same_char(pfx + 1, 0))
        pfx++

    # sfx = number of trailing characters common to all names.  The bound
    # pfx + sfx < minlen keeps the suffix from reusing characters already
    # counted in the prefix; without it a list such as "ab,abab" would have
    # "ab" stripped twice from the longer name and report no difference.
    sfx = 0
    while (pfx + sfx < minlen && same_char(sfx + 1, 1))
        sfx++

    # what lies between the common prefix and the common suffix
    for (i = 1; i <= n; i++)
        print substr(fname[i], pfx + 1, length(fname[i]) - pfx - sfx)
}' <<< "${in}"
注意事项:
- 此时有点冗长;或许可以精简一点...
- 可以修改代码,使其直接处理“普通”的文件列表(例如,将
find 的输出通过管道传给 awk);另一个想法是把文件作为参数传给 awk,对每个文件只处理第一条记录 (FNR==1),并把 FILENAME 存入数组
测试结果:
# in='Exp1_ML_Rep1.txt,Exp1_ML_Rep2.txt,Exp1_ML_Rep3.txt'
1
2
3
# in='Exp2_DT_10ng_55C_1_User1.png,Exp2_DT_10ng_55C_2_User1.png,Exp2_DT_10ng_55C_3_User1.png'
1
2
3
# in='x_foo13,x_bar13,x_baz13,x_qux13'
foo
bar
baz
qux
# in='x_foo13,x_bar13,x_baz13,x_abcde23'
foo1
bar1
baz1
abcde2
# in='abcde.123,abcde.123,abcde.123' # identical
# three
# blank
# lines
# in='abc,def,123456,xyz$$' # nothing in common
abc
def
123456
xyz$$