将文档的分词结果转化为非严格的UCI格式,即没有前三行的统计信息,因为这三行在LightLDA中没有使用到。
分词后结果的格式
doc1seqword1 word2 word3...
doc2seqword2 word3...
doc3seqword2 word3
seq是自定义的分隔符,分隔文档名和文档内容的分词结果,作为参数传入函数
/**
 * Converts a word-segmented text file into (loose) UCI bag-of-words format.
 *
 * <p>Each input line is {@code docName + seq + "word1 word2 ..."}. The output is
 * a docword file of {@code "docId wordId count"} triples (one document per id,
 * ids ascending) and a vocabulary file with one token per line, where line number
 * (1-based) is the token's word id. The three UCI header lines (docCount,
 * vocabCount, nnz) are intentionally omitted — LightLDA does not read them.
 *
 * <p>The input file is read twice on purpose: buffering all documents in memory
 * would not scale to large corpora.
 *
 * @param seq         separator (a regex, as used by {@link String#split}) between
 *                    the document name and its segmented content
 * @param filePath    input: path of the segmented text file
 * @param docwordPath output: path for the docword triples
 * @param vocabPath   output: path for the vocabulary listing
 * @throws Exception if any I/O operation fails
 */
public static void text2UCI(String seq, String filePath, String docwordPath, String vocabPath) throws Exception{
    // Pass 1: scan the whole file to collect the vocabulary.
    // Tokens must not contain spaces or tabs (they are split on runs of spaces).
    HashSet<String> vocabSet = new HashSet<>();
    try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
        String line;
        while ((line = br.readLine()) != null) {
            String content = docContent(line, seq);
            if (content == null) {
                continue; // malformed or empty document
            }
            for (String word : content.split(" +")) {
                vocabSet.add(word);
            }
        }
    }

    // Assign word ids (1-based — LightLDA expects ids starting at 1) and
    // write the vocabulary file, one token per line in id order.
    List<String> vocabList = new ArrayList<>(vocabSet);
    HashMap<String, Integer> wordIds = new HashMap<>();
    try (BufferedWriter vocabBw = new BufferedWriter(new FileWriter(vocabPath))) {
        int id = 1;
        for (String token : vocabList) {
            wordIds.put(token, id++);
            vocabBw.write(token);
            vocabBw.newLine();
        }
    }

    // Pass 2: re-read the input and emit "docId wordId count" triples.
    int docId = 1; // document ids start at 1
    try (BufferedReader br = new BufferedReader(new FileReader(filePath));
         BufferedWriter docBw = new BufferedWriter(new FileWriter(docwordPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            String content = docContent(line, seq);
            if (content == null) {
                continue; // must skip exactly the same lines as pass 1
            }
            String[] words = content.split(" +");
            int[] ids = new int[words.length];
            for (int i = 0; i < words.length; ++i) {
                ids[i] = wordIds.get(words[i]); // always present: pass 1 saw every word
            }
            Arrays.sort(ids); // UCI wants word ids ascending within a document

            // Equal ids are now adjacent; emit one triple per run.
            StringBuilder docInfo = new StringBuilder();
            int i = 0;
            while (i < ids.length) {
                int j = i;
                while (j < ids.length && ids[j] == ids[i]) {
                    ++j;
                }
                docInfo.append(docId).append(' ').append(ids[i]).append(' ')
                       .append(j - i).append('\n');
                i = j;
            }
            docBw.write(docInfo.toString());
            docId++;
        }
    }
    System.out.println("vocab size : " + vocabList.size());
    System.out.println("doc size: " + (docId - 1));
}

/**
 * Extracts the trimmed content part of an input line, or returns {@code null}
 * when the line should be skipped (no separator present, or empty content).
 * The limit of 2 keeps later occurrences of {@code seq} inside the content
 * intact instead of truncating the document at the second match.
 */
private static String docContent(String line, String seq) {
    String[] parts = line.split(seq, 2);
    if (parts.length < 2) {
        return null; // no separator: skip instead of throwing AIOOBE
    }
    String content = parts[1].trim();
    return content.isEmpty() ? null : content;
}