#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import re
import os
import csv
# Path of the CSV index file.
# NOTE(review): these are absolute, machine-specific Windows paths, and the
# "數(shù)據" segment looks like a text-conversion artifact - confirm the real
# directory name before running.
ALL_NEWS_CSV_PATH = "C:/Users/86177/Desktop/ALLNEWS數(shù)據/news_csv/allNews.csv" # path of the csv file
# Directory that holds the source news files (one "<id>.txt" per news item).
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS數(shù)據/allNews/allNews'
# Root directory under which the split files are saved.
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path, encoding=None):
    """
    Read one news file and split its text into paragraphs.

    The text is split wherever two consecutive newline characters occur.
    Nothing is stripped, so text that ends with a blank line yields a
    trailing empty string.

    :param news_path: path of the news text file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: list of paragraph strings
    """
    with open(news_path, "r", encoding=encoding) as fr:
        content = fr.read()
    # Text mode already translates "\r\n" to "\n", so Windows files split the
    # same way. Splitting by date or by numbered headings was only sketched
    # in the original and is not implemented.
    return content.split('\n\n')
def read_csv(csv_path, encoding=None):
    """
    Load every row of the CSV index file.

    :param csv_path: path of the CSV index file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: list of rows, each row being a list of column strings
    """
    # newline="" hands raw line endings to the csv module, which is what it
    # requires to parse quoted fields containing line breaks correctly.
    with open(csv_path, "r", newline="", encoding=encoding) as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """
    Save one news item as a series of numbered text files.

    The title is written to "<news_name>-1.txt" and the paragraphs to
    "<news_name>-2.txt", "<news_name>-3.txt", ... in list order. Existing
    files with those names are overwritten.

    :param save_dir: existing directory that receives the files
    :param news_name: file-name stem of the news item
    :param news_title: title text
    :param content_item_list: paragraph strings
    :return: None
    """
    title_path = os.path.join(save_dir, news_name + '-1.txt')
    # "w" truncates and writes; the file is never read back, so "w+" is not needed.
    with open(title_path, "w") as fw:
        fw.write(news_title)
    # Number 1 is taken by the title, so paragraphs start at 2.
    for number, item in enumerate(content_item_list, start=2):
        content_path = os.path.join(save_dir, '{}-{}.txt'.format(news_name, number))
        with open(content_path, "w") as fw:
            fw.write(item)
def save_all_news(news_summary_list):
    """
    Split and save every news item listed in the CSV index rows.

    For each row, column 0 is used as the news file stem, column 1 is
    written as the title and column 3 names the sub-directory of
    ALL_NEWS_SAVE_DIR that receives the output. A row that fails (short
    row, missing file, ...) is reported with print() and skipped so the
    remaining rows are still processed.

    :param news_summary_list: rows as returned by read_csv()
    :return: None
    """
    for item in news_summary_list:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best-effort batch job: report and move on to the next row.
            print(err)
def main():
    """Load the CSV index and split every news file it lists."""
    news_summary_list = read_csv(ALL_NEWS_CSV_PATH)
    save_all_news(news_summary_list)


if __name__ == '__main__':
    main()
# ---- Revised version (修正版) ----
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import re
import os
import csv
# Path of the CSV index file.
# NOTE(review): these are absolute, machine-specific Windows paths, and the
# "數(shù)據" segment looks like a text-conversion artifact - confirm the real
# directory name before running.
ALL_NEWS_CSV_PATH = "C:/Users/86177/Desktop/ALLNEWS數(shù)據/news_csv/allNews.csv" # path of the csv file
# Directory that holds the source news files (one "<id>.txt" per news item).
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS數(shù)據/allNews/allNews'
# Root directory under which the split files are saved.
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path, encoding=None):
    """
    Read one news file and split its text into non-empty paragraphs.

    Leading and trailing whitespace of the whole text is removed, the text
    is split wherever two consecutive newline characters occur, and pieces
    that contain only whitespace are dropped. An empty file therefore
    yields an empty list instead of one empty paragraph.

    :param news_path: path of the news text file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: list of paragraph strings
    """
    with open(news_path, "r", encoding=encoding) as fr:
        content = fr.read()
    # Dropping blank pieces keeps save_news() from creating empty files.
    return [piece for piece in content.strip().split('\n\n') if piece.strip()]
def read_csv(csv_path, encoding=None):
    """
    Load every row of the CSV index file.

    :param csv_path: path of the CSV index file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: list of rows, each row being a list of column strings
    """
    # newline="" hands raw line endings to the csv module, which is what it
    # requires to parse quoted fields containing line breaks correctly.
    with open(csv_path, "r", newline="", encoding=encoding) as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """
    Save one news item as a series of numbered text files.

    The title is written to "<news_name>-1.txt" and the paragraphs to
    "<news_name>-2.txt", "<news_name>-3.txt", ... in list order. Existing
    files with those names are overwritten.

    :param save_dir: existing directory that receives the files
    :param news_name: file-name stem of the news item
    :param news_title: title text
    :param content_item_list: paragraph strings
    :return: None
    """
    title_path = os.path.join(save_dir, news_name + '-1.txt')
    # "w" truncates and writes; the file is never read back, so "w+" is not needed.
    with open(title_path, "w") as fw:
        fw.write(news_title)
    # Number 1 is taken by the title, so paragraphs start at 2.
    for number, item in enumerate(content_item_list, start=2):
        content_path = os.path.join(save_dir, '{}-{}.txt'.format(news_name, number))
        with open(content_path, "w") as fw:
            fw.write(item)
def save_all_news(news_summary_list):
    """
    Split and save every news item listed in the CSV index rows.

    For each row, column 0 is used as the news file stem, column 1 is
    written as the title and column 3 names the sub-directory of
    ALL_NEWS_SAVE_DIR that receives the output. A row that fails (short
    row, missing file, ...) is reported with print() and skipped so the
    remaining rows are still processed.

    :param news_summary_list: rows as returned by read_csv()
    :return: None
    """
    for item in news_summary_list:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best-effort batch job: report and move on to the next row.
            print(err)
def main():
    """Load the CSV index and split every news file it lists."""
    news_summary_list = read_csv(ALL_NEWS_CSV_PATH)
    save_all_news(news_summary_list)


if __name__ == '__main__':
    main()
# ---- Version 3: drop empty lines and lines shorter than 4 characters (版本3) ----
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import os
import csv
# Path of the CSV index file.
# NOTE(review): these are absolute, machine-specific Windows paths, and the
# "數(shù)據" segment looks like a text-conversion artifact - confirm the real
# directory name before running.
ALL_NEWS_CSV_PATH = "C:/Users/86177/Desktop/ALLNEWS數(shù)據/news_csv/allNews.csv" # path of the csv file
# Directory that holds the source news files (one "<id>.txt" per news item).
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS數(shù)據/allNews/allNews'
# Root directory under which the split files are saved.
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path, encoding=None, min_length=4):
    """
    Read one news file line by line, keeping only meaningful lines.

    Every line is stripped of surrounding whitespace; lines whose stripped
    length is below min_length (which includes blank lines) are discarded.

    :param news_path: path of the news text file
    :param encoding: encoding passed to open(); None keeps the platform default
    :param min_length: minimum stripped length a line needs to be kept
    :return: list of stripped line strings
    """
    with open(news_path, "r", encoding=encoding) as fr:
        stripped_lines = (line.strip() for line in fr)
        # A blank line has length 0, so one comparison covers both the
        # "empty" and the "too short" case.
        return [line for line in stripped_lines if len(line) >= min_length]
def read_csv(csv_path, encoding=None):
    """
    Load every row of the CSV index file.

    :param csv_path: path of the CSV index file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: list of rows, each row being a list of column strings
    """
    # newline="" hands raw line endings to the csv module, which is what it
    # requires to parse quoted fields containing line breaks correctly.
    with open(csv_path, "r", newline="", encoding=encoding) as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """
    Save one news item as a series of numbered text files.

    The title is written to "<news_name>-1.txt" and the content items to
    "<news_name>-2.txt", "<news_name>-3.txt", ... in list order. Existing
    files with those names are overwritten.

    :param save_dir: existing directory that receives the files
    :param news_name: file-name stem of the news item
    :param news_title: title text
    :param content_item_list: content strings, one output file each
    :return: None
    """
    title_path = os.path.join(save_dir, news_name + '-1.txt')
    # "w" truncates and writes; the file is never read back, so "w+" is not needed.
    with open(title_path, "w") as fw:
        fw.write(news_title)
    # Number 1 is taken by the title, so content items start at 2.
    for number, item in enumerate(content_item_list, start=2):
        content_path = os.path.join(save_dir, '{}-{}.txt'.format(news_name, number))
        with open(content_path, "w") as fw:
            fw.write(item)
def save_all_news(news_summary_list):
    """
    Split and save every news item listed in the CSV index rows.

    For each row, column 0 is used as the news file stem, column 1 is
    written as the title and column 3 names the sub-directory of
    ALL_NEWS_SAVE_DIR that receives the output. A row that fails (short
    row, missing file, ...) is reported with print() and skipped so the
    remaining rows are still processed.

    :param news_summary_list: rows as returned by read_csv()
    :return: None
    """
    for item in news_summary_list:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best-effort batch job: report and move on to the next row.
            print(err)
def main():
    """Load the CSV index and split every news file it lists."""
    news_summary_list = read_csv(ALL_NEWS_CSV_PATH)
    save_all_news(news_summary_list)


if __name__ == '__main__':
    main()
# ---- Format-tidied version (格式整理版本) ----
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# The two lines above select the interpreter and declare the source encoding.
import os
import csv
# Path of the CSV index file.
# NOTE(review): these are absolute, machine-specific Windows paths, and the
# "數(shù)據" segment looks like a text-conversion artifact - confirm the real
# directory name before running.
ALL_NEWS_CSV_PATH = r"C:/Users/86177/Desktop/ALLNEWS數(shù)據/news_csv/allNews.csv" # path of the csv file
# Directory that holds the source news files (one "<id>.txt" per news item).
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS數(shù)據/allNews/allNews'
# Root directory under which the split files are saved.
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path, encoding=None):
    """
    Read one news file and split its text into non-empty paragraphs.

    Leading and trailing whitespace of the whole text is removed, the text
    is split wherever two consecutive newline characters occur, and pieces
    that contain only whitespace are dropped. An empty file therefore
    yields an empty list instead of one empty paragraph.

    :param news_path: path of the news text file (not the CSV index)
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: content_item_list, the list of paragraph strings
    """
    with open(news_path, 'r', encoding=encoding) as fr:
        content = fr.read()
    # Dropping blank pieces keeps save_news() from creating empty files.
    return [piece for piece in content.strip().split('\n\n') if piece.strip()]
def read_csv(csv_path, encoding=None):
    """
    Load every row of the CSV index file.

    :param csv_path: path of the CSV index file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: news_summary_list, a list of rows (each a list of column strings)
    """
    # newline="" hands raw line endings to the csv module, which is what it
    # requires to parse quoted fields containing line breaks correctly.
    with open(csv_path, 'r', newline='', encoding=encoding) as fr:
        return list(csv.reader(fr))
def save_all_news(news_summary_list):
    """
    Split and save every news item listed in the CSV index rows.

    For each row, column 0 is used as the news file stem, column 1 is
    written as the title and column 3 names the category sub-directory of
    ALL_NEWS_SAVE_DIR that receives the output. A row that fails (short
    row, missing file, ...) is reported with print() and skipped so the
    remaining rows are still processed.

    :param news_summary_list: rows as returned by read_csv()
    :return: None
    """
    for item in news_summary_list:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            # Source file of this news item: "<id>.txt" in the source directory.
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            # Output belongs under the save root, not next to the source
            # files (the original joined ALL_NEWS_SOURCE_DIR here).
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best-effort batch job: report and move on to the next row.
            print(err)
def save_news(save_dir, news_name, news_title, content_item_list):
    """
    Save one news item as a series of numbered text files.

    The title is written to "<news_name>-1.txt" (for example
    "9354665-1.txt") and the paragraphs to "<news_name>-2.txt",
    "<news_name>-3.txt", ... in list order, all inside save_dir. Existing
    files with those names are overwritten.

    :param save_dir: existing directory that receives the files
    :param news_name: file-name stem of the news item
    :param news_title: title text
    :param content_item_list: paragraph strings
    :return: None
    """
    # Open the joined path: the original opened the bare file name, which
    # put the title file in the current working directory.
    title_path = os.path.join(save_dir, news_name + '-1.txt')
    with open(title_path, "w") as fw:
        fw.write(news_title)
    # Number 1 is taken by the title, so paragraphs start at 2.
    for number, item in enumerate(content_item_list, start=2):
        content_path = os.path.join(save_dir, '{}-{}.txt'.format(news_name, number))
        with open(content_path, "w") as fw:
            fw.write(item)
def main():
    """Load the CSV index and split every news file it lists."""
    news_summary_list = read_csv(ALL_NEWS_CSV_PATH)
    save_all_news(news_summary_list)


if __name__ == '__main__':
    main()
# ---- Fourth version (第四版) ----
# Output files are now named like: 358543.txt_0001
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import os
import csv
import glob
# Path of the CSV index file.
# NOTE(review): these are absolute, machine-specific Windows paths, and the
# "數(shù)據" segment looks like a text-conversion artifact - confirm the real
# directory name before running.
ALL_NEWS_CSV_PATH = r"C:/Users/86177/Desktop/ALLNEWS數(shù)據/news_csv/allNews.csv" # path of the csv file
# Directory that holds the source news files (one "<id>.txt" per news item).
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS數(shù)據/allNews/allNews'
# Root directory under which the split files are saved.
ALL_NEWS_SAVE_DIR = r'D:/data003'
def read_news(news_path, encoding=None, min_length=8):
    """
    Read one news file line by line, keeping only meaningful lines.

    Every line is stripped of surrounding whitespace; lines whose stripped
    length is below min_length (which includes blank lines) are discarded.
    The default threshold is 8, matching the original code; the original
    comment still mentioned 4.

    :param news_path: path of the news text file
    :param encoding: encoding passed to open(); None keeps the platform default
    :param min_length: minimum stripped length a line needs to be kept
    :return: list of stripped line strings
    """
    with open(news_path, "r", encoding=encoding) as fr:
        stripped_lines = (line.strip() for line in fr)
        # A blank line has length 0, so one comparison covers both the
        # "empty" and the "too short" case.
        return [line for line in stripped_lines if len(line) >= min_length]
def read_csv(csv_path, encoding=None):
    """
    Load every row of the CSV index file.

    :param csv_path: path of the CSV index file
    :param encoding: encoding passed to open(); None keeps the platform default
    :return: list of rows, each row being a list of column strings
    """
    # newline="" hands raw line endings to the csv module, which is what it
    # requires to parse quoted fields containing line breaks correctly.
    with open(csv_path, "r", newline="", encoding=encoding) as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """
    Save one news item as a series of numbered files.

    The title is written to "<news_name>.txt_0001" and the content items to
    "<news_name>.txt_0002", "<news_name>.txt_0003", ... in list order. The
    number is zero-padded to four digits; larger numbers are written in
    full. Existing files with those names are overwritten.

    :param save_dir: existing directory that receives the files
    :param news_name: file-name stem of the news item
    :param news_title: title text
    :param content_item_list: content strings, one output file each
    :return: None
    """
    title_path = os.path.join(save_dir, news_name + '.txt_0001')
    # "w" truncates and writes; the file is never read back, so "w+" is not needed.
    with open(title_path, "w") as fw:
        fw.write(news_title)
    # Number 1 is taken by the title, so content items start at 2.
    for number, item in enumerate(content_item_list, start=2):
        # {:04d} replaces the hand-written padding chain keyed on digit count.
        content_name = '{}.txt_{:04d}'.format(news_name, number)
        with open(os.path.join(save_dir, content_name), "w") as fw:
            fw.write(item)
def save_all_news(news_summary_list, max_files=1000):
    """
    Split and save the first max_files news items listed in the CSV rows.

    For each row, column 0 is used as the news file stem, column 1 is
    written as the title and column 3 names the category sub-directory of
    ALL_NEWS_SAVE_DIR that receives the output. A row that fails (short
    row, missing file, ...) is reported with print() and skipped so the
    remaining rows are still processed.

    :param news_summary_list: list of rows as returned by read_csv()
    :param max_files: number of leading rows to process; None processes all
    :return: None
    """
    # The original tried to cap the run with len(p.append(...)), but
    # list.append() returns None, so every row raised TypeError and nothing
    # was saved. Slicing implements the intended "first 1000 files" limit.
    for item in news_summary_list[:max_files]:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best-effort batch job: report and move on to the next row.
            print(err)
def main():
    """Load the CSV index and split the news files it lists."""
    news_summary_list = read_csv(ALL_NEWS_CSV_PATH)
    save_all_news(news_summary_list)


if __name__ == '__main__':
    main()