【发布时间】:2014-06-18 20:05:17
【问题描述】:
我是 IPython 和 pandas 的新手。
当我运行 pd.crosstab(df['A'], df['B']) 时,出现了 MemoryError 错误。
数据框有 10,000,000 行。我想可能是数据量太大了。
我用 df.values.nbytes + df.index.nbytes + df.columns.nbytes
检查了数据框占用的内存,只有 381 MB,而我的服务器有 16 GB 内存。
如果我运行包含 1,000,000 行的数据框,则没有问题。
希望有人能帮忙。
错误的调试日志:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-6-199f99c3064f> in <module>()
99 df = df.applymap(lambda x: np.nan if str(x) == "N/A" or len(str(x).strip()) == 0 else x)
100
--> 101 summary_table(df)
<ipython-input-6-199f99c3064f> in summary_table(df)
78 dis_for_cont_vars(df)
79
---> 80 value_count(df)
81 #END summary_table
82
<ipython-input-6-199f99c3064f> in value_count(df)
63 def value_count(df):
64 print "===> Value counts\n"
---> 65 print pd.crosstab(df['A'], df['B'])
66 print "===>\n"
67
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/tools/pivot.pyc in crosstab(rows, cols, values, rownames, colnames, aggfunc, margins, dropna)
368 df['__dummy__'] = 0
369 table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
--> 370 aggfunc=len, margins=margins, dropna=dropna)
371 return table.fillna(0).astype(np.int64)
372 else:
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/tools/pivot.pyc in pivot_table(data, values, rows, cols, aggfunc, fill_value, margins, dropna)
108 to_unstack = [agged.index.names[i]
109 for i in range(len(rows), len(keys))]
--> 110 table = agged.unstack(to_unstack)
111
112 if not dropna:
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in unstack(self, level)
3211 """
3212 from pandas.core.reshape import unstack
-> 3213 return unstack(self, level)
3214
3215 #----------------------------------------------------------------------
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in unstack(obj, level)
416 def unstack(obj, level):
417 if isinstance(level, (tuple, list)):
--> 418 return _unstack_multiple(obj, level)
419
420 if isinstance(obj, DataFrame):
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _unstack_multiple(data, clocs)
316 columns=data.columns)
317
--> 318 unstacked = dummy.unstack('__placeholder__')
319 if isinstance(unstacked, Series):
320 unstcols = unstacked.index
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in unstack(self, level)
3211 """
3212 from pandas.core.reshape import unstack
-> 3213 return unstack(self, level)
3214
3215 #----------------------------------------------------------------------
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in unstack(obj, level)
420 if isinstance(obj, DataFrame):
421 if isinstance(obj.index, MultiIndex):
--> 422 return _unstack_frame(obj, level)
423 else:
424 return obj.T.stack(dropna=False)
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _unstack_frame(obj, level)
459 unstacker = _Unstacker(obj.values, obj.index, level=level,
460 value_columns=obj.columns)
--> 461 return unstacker.get_result()
462
463
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in get_result(self)
141 # TODO: find a better way than this masking business
142
--> 143 values, value_mask = self.get_new_values()
144 columns = self.get_new_columns()
145 index = self.get_new_index()
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in get_new_values(self)
185 else:
186 dtype, fill_value = _maybe_promote(values.dtype)
--> 187 new_values = np.empty(result_shape, dtype=dtype)
188 new_values.fill(fill_value)
189
MemoryError:
【问题讨论】:
-
我发现自己在使用交叉表时,内存占用也超过了 16 GB,导致任务崩溃。
标签: pandas ipython crosstab large-data