由于没有答案,我自己写了。它非常粗糙但有效。它读取一个 URL 列表,提取主机,对它们进行计数,然后用与主机的反频率成比例的索引填充一个鸽巢数组。
package org.xmlcml.cmine.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
public class URLShuffler {
public static final Logger LOG = Logger.getLogger(URLShuffler.class);
static {
LOG.setLevel(Level.DEBUG);
}
// 以防我们需要额外的分类,但它似乎不适用于中等问题
private static int TOL = 1;
private List<String> urls;
private Multiset<String> domains;
private Map<String, Integer> currentIndexByDomain;
private Map<String, Integer> countByDomain;
private List<String> outputUrls;
public URLShuffler() {
}
public void readURLs(List<String> urls) {
this.urls= urls;
domains = HashMultiset.create();
for (String url : urls) {
String domain = getDomain(url);
domains.add(domain);
}
LOG.debug(domains);
}
// this would be better using java.net.URL
private String getDomain(String url) {
int idx = url.indexOf("//");
if (idx != -1) {
url = url.substring(idx+2);
}
idx = url.indexOf("/");
String domain = url.substring(0, idx);
return domain;
}
public List<String> getShuffledUrls() {
currentIndexByDomain = new HashMap<String, Integer>();
countByDomain = new HashMap<String, Integer>();
outputUrls = new ArrayList<String>();
for (int i = 0; i < urls.size() * TOL; i++) {
outputUrls.add("");
}
// 这是包装 Guava 排序的便捷方法。
for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
LOG.debug(entry);
countByDomain.put(entry.getElement(), entry.getCount());
currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
}
for (String url : urls) {
String domain = getDomain(url);
Integer currentIndex = currentIndexByDomain.get(domain);
Integer count = countByDomain.get(domain);
int slot = (urls.size() * currentIndex * TOL) / count;
currentIndexByDomain.put(domain, currentIndex - 1);
addUrl(url, slot);
}
return outputUrls;
}
private void addUrl(String url, int slot) {
boolean filled = fillLower(url, slot);
if (!filled) {
fillUpper(url, slot);
}
}
// if slot is not free run upwards till next free slot
private boolean fillUpper(String url, int slot) {
for (int i = slot; i < outputUrls.size(); i++) {
if (fill(url, i)) {
return true;
}
}
return false;
}
// if slot is not free run downwards till next free slot
private boolean fillLower(String url, int slot) {
for (int i = slot; i >= 0; i--) {
if (fill(url, i)) {
return true;
}
}
return false;
}
private boolean fill(String url, int slot) {
if (outputUrls.get(slot).equals("")) {
outputUrls.set(slot, url);
return true;
}
return false;
}
}
```