Fast CSV file reading, removing blank lines, custom field order, encoding, log level, ignoring errors vs. retrying requests, increasing the maximum thread pool.
Fast CSV reading, with a progress bar.
# Reading the “達(dá)觀杯” (DaGuan Cup) CSV data file in chunks, with a tqdm progress bar
import pandas as pd
from tqdm import tqdm


def reader_pandas(file, chunkSize=100000, partitions=10 ** 4):
    # Open the CSV lazily and pull it in chunk by chunk so the progress bar can update
    reader = pd.read_csv(file, iterator=True)
    chunks = []
    with tqdm(range(partitions), 'Reading ...') as t:
        for _ in t:
            try:
                chunk = reader.get_chunk(chunkSize)
                chunks.append(chunk)
            except StopIteration:
                # No more rows to read
                break
    return pd.concat(chunks, ignore_index=True)


print(reader_pandas("./data/train_set.csv"))
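An equivalent, slightly more idiomatic sketch uses the chunksize argument of pd.read_csv, which returns an iterator of DataFrames directly; the function name and chunk size below are placeholders, not part of the original code.

import pandas as pd
from tqdm import tqdm

def read_csv_chunked(file, chunk_size=100000):
    # pd.read_csv with chunksize yields one DataFrame per chunk; tqdm wraps the iterator
    chunks = [chunk for chunk in tqdm(pd.read_csv(file, chunksize=chunk_size), desc='Reading ...')]
    return pd.concat(chunks, ignore_index=True)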
Run the spider from a script (e.g. a run.py placed inside the Scrapy project):

if __name__ == '__main__':
    from scrapy import cmdline
    # Equivalent to running "scrapy crawl Pakistan_thenews" on the command line
    cmdline.execute('scrapy crawl Pakistan_thenews'.split())
    # Export directly to CSV instead:
    # cmdline.execute('scrapy crawl Pakistan_thenews -o ./csv_file/Pakistan_thenews_p.csv -t csv'.split())
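Scrapy also documents running spiders from a script with CrawlerProcess; a minimal sketch, reusing the spider name from the snippet above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # Load the project's settings.py and run the spider in-process
    process = CrawlerProcess(get_project_settings())
    process.crawl('Pakistan_thenews')
    process.start()  # blocks until the crawl finishes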
The remaining options go in the project's settings.py.
# Custom field order for the exported CSV columns
FEED_EXPORT_FIELDS = [
    'country',
    'category',
    'data_url',
    'title',
    'abstract',
    'content',
    'img_url',
    'press_time',
]
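FEED_EXPORT_FIELDS selects and orders fields that must exist on the exported items; a hypothetical items.py matching the column names above might look like this (the class name NewsItem is an assumption, not from the original project):

import scrapy

class NewsItem(scrapy.Item):
    # Field names must match the entries in FEED_EXPORT_FIELDS (hypothetical item class)
    country = scrapy.Field()
    category = scrapy.Field()
    data_url = scrapy.Field()
    title = scrapy.Field()
    abstract = scrapy.Field()
    content = scrapy.Field()
    img_url = scrapy.Field()
    press_time = scrapy.Field()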
# Specify the CSV delimiter (also in the project's settings.py, at the same level)
# CSV_DELIMITER = '\t'
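Newer Scrapy versions (2.4 and later) let you pass exporter keyword arguments, including the delimiter, through the FEEDS setting; a sketch assuming such a version, with the output path borrowed from the run script above:

FEEDS = {
    './csv_file/Pakistan_thenews_p.csv': {
        'format': 'csv',
        # Forwarded as keyword arguments to CsvItemExporter (and on to csv.writer)
        'item_export_kwargs': {'delimiter': '\t'},
    },
}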
# Export encoding (gb18030 covers the full range of Chinese characters)
FEED_EXPORT_ENCODING = "gb18030"
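If the CSV is opened mainly in Excel, UTF-8 with a BOM is another option (an alternative suggestion, not part of the original setup):

# FEED_EXPORT_ENCODING = 'utf-8-sig'  # UTF-8 with BOM, so Excel detects the encoding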
# Log level and log file
# LOG_LEVEL = 'INFO'
# LOG_LEVEL = 'ERROR'
# LOG_FILE = 'mySpider.log'
# To ignore errors instead of retrying the requests, set this to []
# RETRY_HTTP_CODES = []
RETRY_HTTP_CODES = [500, 502, 503, 504, 508, 400, 403, 404, 408, 520]
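Retrying is handled by Scrapy's RetryMiddleware, whose behaviour is tuned by a couple of related settings; the values below are illustrative, not taken from the original project:

RETRY_ENABLED = True   # retries are on by default
RETRY_TIMES = 3        # maximum retries per failed request (Scrapy's default is 2)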
# Increase the maximum reactor thread pool size (Scrapy's default is 10, so use a larger value; 20 is just an example)
REACTOR_THREADPOOL_MAXSIZE = 20
Removing blank lines (blank rows appearing between records in the exported CSV on Windows).
# In scrapy.exporters.CsvItemExporter, pass newline='' to io.TextIOWrapper; this fixes the problem:
class CsvItemExporter(BaseItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        self._configure(kwargs, dont_fail=True)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        self.stream = io.TextIOWrapper(
            file,
            newline='',  # added: prevents the extra blank line after every row on Windows
            line_buffering=False,
            write_through=True,
            encoding=self.encoding
        ) if six.PY3 else file
        self.csv_writer = csv.writer(self.stream, **kwargs)
        self._headers_not_written = True
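Editing the installed Scrapy source is fragile; a common alternative is to copy the patched class into the project (e.g. a hypothetical myproject/exporters.py) and point Scrapy at it with the FEED_EXPORTERS setting:

# settings.py — use the patched exporter for CSV feeds (the module path is a placeholder)
FEED_EXPORTERS = {
    'csv': 'myproject.exporters.CsvItemExporter',
}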