事实上，你目前实现目标的方式存在相当大的缺陷，你需要使用 numpy 来提高性能。
import numpy as np
import matplotlib.pyplot as plt
# Load only the 5th column (zero-based index 4) of the CSV, skipping the
# header row, so the resulting array holds nothing but that column.
stock_price_spy = np.loadtxt(
    'SPY.csv',
    dtype=float,
    delimiter=',',
    skiprows=1,
    usecols=4,
)
# Draw a 50-bin histogram of the loaded values and display it.
n, bins, patches = plt.hist(stock_price_spy, bins=50)
plt.show()
我没有测试过这段代码，但它应该可以正常工作。
我建议你使用英特尔优化版的 Python 发行版（Intel Distribution for Python），它能更好地处理这类计算过程。
补充测试代码。因为有些人试图误导别人，却拿不出真正的论据：pandas 使用的是类似字典的 DataFrame，而不是 numpy 数组；numpy 数组几乎快两倍。
import numpy as np
import pandas as pd
import random
import csv
import matplotlib.pyplot as plt
import time
# Benchmark: read one column of a CSV file and plot its histogram, first via
# pandas and then via numpy.loadtxt, printing the elapsed time of each path.

# Create a random CSV file (6 columns x 4871 data rows) simulating the problem.
rows = 4871
columns = 6
fields = ['one', 'two', 'three', 'four', 'five', 'six']
# The context manager closes (and therefore flushes) the file before it is
# read back below; newline='' is what the csv module expects for its files.
with open("random.csv", "w", newline='') as csv_file:
    write_a_csv = csv.DictWriter(csv_file, fieldnames=fields)
    # Both readers below treat the first line as a header, so write one;
    # otherwise the first data row would be silently consumed as the header.
    write_a_csv.writeheader()
    for i in range(0, rows):
        write_a_csv.writerow({name: random.random() for name in fields})

# --- pandas path ---
# time.clock() was removed in Python 3.8; perf_counter() is the clock meant
# for measuring elapsed time.
start_old = time.perf_counter()
spy = pd.read_csv('random.csv')
print(type(spy))
# Zero-based index 4 is the 5th column, the same column the numpy path reads.
stock_price_spy = spy.values[:, 4]
n, bins, patches = plt.hist(stock_price_spy, 50)
# Stop the timer before plt.show(): showing the window may block until the
# user closes it, which would otherwise be counted as processing time.
end_old = time.perf_counter()
plt.show()
total_time_old = end_old - start_old
print(total_time_old)

# --- numpy path ---
start_new = time.perf_counter()
# Here only the 5th column of the CSV is loaded, which avoids holding the
# other columns in memory.
stock_price_spy_new = np.loadtxt('random.csv', dtype=float,
                                 delimiter=',', skiprows=1, usecols=4)
print(type(stock_price_spy_new))
n, bins, patches = plt.hist(stock_price_spy_new, 50)
end_new = time.perf_counter()
plt.show()
total_time_new = end_new - start_new
print(total_time_new)