之前已經(jīng)分析了倒排索引的過程倒排索引過程解讀 那么噪裕,接下來用代碼實現(xiàn)一下其過程股毫,加深了解:
package search;
import java.util.*;
/**
* 模擬倒排索引(只支持英文)
*
* @author yuyufeng
* @date 2017/10/31
*/
public class InvertedIndex {
/**
* 倒排索引表
*/
Map<String, IndexTable> keywordsIndexTableMap = new HashMap<>();
/**
* 停頓詞
*/
Set<String> stopwords = new HashSet<>();
/**
* 存儲文檔
*/
List<String> store;
public InvertedIndex(List<String> list) {
store = list;
for (int i = 0; i < list.size(); i++) {
String li = keywordsHandler(list.get(i));
String[] array = li.split(" ");
for (int j = 0; j < array.length; j++) {
addKeyWords(array[j], i, j);
}
}
//停頓詞 非關(guān)鍵詞
stopwords.add("is");
stopwords.add("and");
stopwords.add("or");
stopwords.add("the");
}
/**
* 建立索引 建立倒排索引表
*/
public void addKeyWords(String keywords, Integer i, Integer j) {
//去空
if ("".equals(keywords)) {
return;
}
IndexTable indexTable = keywordsIndexTableMap.get(keywords);
if (indexTable == null) {
//插入關(guān)鍵詞信息
indexTable = new IndexTable();
indexTable.keywords = keywords;
//插入關(guān)鍵詞出現(xiàn)的文檔Id,次數(shù)為1(首次出現(xiàn))
indexTable.docTimes.put(i, 1);
//插入關(guān)鍵詞出現(xiàn)的文檔Id,在文檔中位置
indexTable.docIndex.put(i, " "+j);
keywordsIndexTableMap.put(keywords, indexTable);
} else {
//更新關(guān)鍵詞出現(xiàn)的文檔祭陷、頻率
indexTable.docTimes.put(i, indexTable.docTimes.get(i) == null ? 1 : indexTable.docTimes.get(i) + 1);
//更新關(guān)鍵詞出現(xiàn)的文檔趣席、位置
indexTable.docIndex.put(i, (indexTable.docIndex.get(i) == null ? "" : indexTable.docIndex.get(i)) + " " + j);
keywordsIndexTableMap.put(keywords, indexTable);
}
}
//去除無關(guān)詞,統(tǒng)一大小寫
private String keywordsHandler(String doc) {
doc = doc.replaceAll(",|:|\\.", " ");
doc = doc.toLowerCase();
return doc;
}
/**
* 查詢
* @param keywords
* @return
*/
public List<String> search(String keywords) {
keywords = keywordsHandler(keywords);
//用Set可以去重
String[] kewordsArray = keywords.split(" ");
List<String> result = new ArrayList<>();
//<文檔想罕,出現(xiàn)次數(shù)>
Map<Integer, DocTemp> docSortTimes = new HashMap();
for (int i = 0; i < kewordsArray.length; i++) {
String key = kewordsArray[i];
IndexTable indexTable = keywordsIndexTableMap.get(key);
if (indexTable == null) {
continue;
}
//找到關(guān)鍵詞霉涨,統(tǒng)計出現(xiàn)的文檔笙瑟,頻率,出現(xiàn)的文檔數(shù)
for (Integer integer : indexTable.docTimes.keySet()) {
if (docSortTimes.get(integer) == null) {
DocTemp docTemp = new DocTemp();
docTemp.docId = integer;
docTemp.times = 1;
docTemp.allTimes = indexTable.docTimes.get(integer);
docSortTimes.put(integer, docTemp);
} else {
DocTemp docTemp = docSortTimes.get(integer);
docTemp.times++;
docTemp.allTimes += indexTable.docTimes.get(integer);
}
}
}
List<DocTemp> docTemps = new ArrayList<>();
//轉(zhuǎn)化List 排序
for (Integer docId : docSortTimes.keySet()) {
DocTemp docTemp = docSortTimes.get(docId);
docTemps.add(docTemp);
}
//對結(jié)果排序鸠蚪,一個Doc包含關(guān)鍵詞越多师溅,排名越前。如相等蘸鲸,則比較出現(xiàn)總次數(shù) \\這個規(guī)則可以自定義
docTemps.sort(new Comparator<DocTemp>() {
@Override
public int compare(DocTemp o1, DocTemp o2) {
if(o1.times.equals(o2.times)){
return o2.allTimes - o1.times;
}
return o2.times-o1.times;
}
});
//包裝結(jié)果
for (int i = 0; i < docTemps.size(); i++) {
result.add(store.get(docTemps.get(i).docId));
}
return result;
}
public static void main(String[] args) {
List<String> docs = new ArrayList<>();
//docId = 0
String doc = "Inverted index comes from the actual application, it needs to find the record according to the value of the attribute.";
docs.add(doc);
//docId = 1
doc = "Each item Each in the index table includes an attribute value and the address of each record with the attribute value";
docs.add(doc);
//docId = 2
doc = "Attribute values are not determined by records";
docs.add(doc);
//docId = 3
doc = "Rather, the location of the record is determined by the attribute value";
docs.add(doc);
//docId = 4
doc = "It is called inverted index";
docs.add(doc);
//docId = 5
doc = "The data operation is simple: the data used by the search engine is easy to operate, generally speaking, only need to add, delete, change, check several functions";
docs.add(doc);
//docId = 6
doc = "And data has a specific format, you can design simple and efficient applications for these applications";
docs.add(doc);
//建立索引
InvertedIndex invertedIndex = new InvertedIndex(docs);
//查看倒排索引表
System.out.println("關(guān)鍵詞 出現(xiàn)doc[次數(shù)],{位置}");
for (String key : invertedIndex.keywordsIndexTableMap.keySet()) {
System.out.print(key + " ");
IndexTable indexTable = invertedIndex.keywordsIndexTableMap.get(key);
for (Integer integer : indexTable.docTimes.keySet()) {
System.out.print(integer + "[" + indexTable.docTimes.get(integer) + "],");
System.out.print("{" + indexTable.docIndex.get(integer) + "} ");
}
System.out.println("");
}
//執(zhí)行查詢
List<String> results = invertedIndex.search("specific format location determined attribute");
System.out.println("=========== 查詢結(jié)果 ==========");
for (String result : results) {
System.out.println(result);
}
}
}
/**
* 倒排索引表
*/
class IndexTable {
/**
* 關(guān)鍵詞
*/
String keywords;
/**
* 出現(xiàn)頻率
*/
Map<Integer, Integer> docTimes = new HashMap<>();
/**
* 出現(xiàn)位置
*/
Map<Integer, String> docIndex = new HashMap<>();
}
/**
* 用于排序
*/
class DocTemp {
Integer docId;
Integer times; //出現(xiàn)在多少Doc中
Integer allTimes; //出現(xiàn)總次數(shù)
}
運行結(jié)果
關(guān)鍵詞 出現(xiàn)doc[次數(shù)],{位置}
called 4[1],{ 2}
data 5[2],{ 1 7} 6[1],{ 1}
functions 5[1],{ 32}
several 5[1],{ 31}
simple 5[1],{ 4} 6[1],{ 10}
used 5[1],{ 8}
these 6[1],{ 15}
speaking 5[1],{ 19}
find 0[1],{ 11}
record 0[1],{ 13} 1[1],{ 16} 3[1],{ 6}
only 5[1],{ 21}
from 0[1],{ 3}
has 6[1],{ 2}
inverted 0[1],{ 0} 4[1],{ 3}
you 6[1],{ 7}
needs 0[1],{ 9}
add 5[1],{ 24}
actual 0[1],{ 5}
item 1[1],{ 1}
in 1[1],{ 3}
need 5[1],{ 22}
format 6[1],{ 5}
index 0[1],{ 1} 1[1],{ 5} 4[1],{ 4}
includes 1[1],{ 7}
is 3[1],{ 7} 4[1],{ 1} 5[2],{ 3 13}
it 0[1],{ 8} 4[1],{ 0}
check 5[1],{ 30}
an 1[1],{ 8}
easy 5[1],{ 14}
each 1[3],{ 0 2 15}
operate 5[1],{ 16}
determined 2[1],{ 4} 3[1],{ 8}
records 2[1],{ 6}
rather 3[1],{ 0}
values 2[1],{ 1}
comes 0[1],{ 2}
according 0[1],{ 14}
for 6[1],{ 14}
delete 5[1],{ 26}
can 6[1],{ 8}
not 2[1],{ 3}
search 5[1],{ 11}
are 2[1],{ 2}
engine 5[1],{ 12}
and 1[1],{ 11} 6[2],{ 0 11}
of 0[1],{ 18} 1[1],{ 14} 3[1],{ 4}
by 2[1],{ 5} 3[1],{ 9} 5[1],{ 9}
design 6[1],{ 9}
attribute 0[1],{ 20} 1[2],{ 9 19} 2[1],{ 0} 3[1],{ 11}
value 0[1],{ 17} 1[2],{ 10 20} 3[1],{ 12}
table 1[1],{ 6}
a 6[1],{ 3}
address 1[1],{ 13}
efficient 6[1],{ 12}
change 5[1],{ 28}
specific 6[1],{ 4}
generally 5[1],{ 18}
the 0[4],{ 4 12 16 19} 1[3],{ 4 12 18} 3[3],{ 2 5 10} 5[3],{ 0 6 10}
with 1[1],{ 17}
application 0[1],{ 6}
location 3[1],{ 3}
to 0[2],{ 10 15} 5[2],{ 15 23}
operation 5[1],{ 2}
applications 6[2],{ 13 16}
=========== 查詢結(jié)果 ==========
Inverted index comes from the actual application, it needs to find the record according to the value of the attribute.
It is called inverted index
Each item Each in the index table includes an attribute value and the address of each record with the attribute value