参考网站:
https://github.com/fengdu78/machine_learning_beginner/tree/master/pandas/Pandas_Exercises
https://github.com/ajcr/100-pandas-puzzles
https://blog.csdn.net/jclian91/article/details/84289537
1. Pandas_Exercises
1.Getting and knowing
- Chipotle
- Occupation
- World Food Facts
2.Filtering and Sorting
- Chipotle
- Euro12
- Fictional Army
3.Grouping
- Alcohol Consumption
- Occupation
- Regiment
4.Apply
- Students
- Alcohol Consumption
- US_Crime_Rates
5.Merge
- Auto_MPG
- Fictitious Names
- House Market
6.Stats
- US_Baby_Names
- Wind_Stats
7.Visualization
- Chipotle
- Titanic Disaster
- Scores
- Online Retail
- Tips
8.Creating Series and DataFrames
- Pokemon
9.Time Series
- Apple_Stock
- Getting_Financial_Data
- Investor_Flow_of_Funds_US
10.Deleting
- Iris
- Wine
2. Pandas 100
#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd # In[2]: pd.__version__ # In[3]: pd.show_versions() # In[5]: import numpy as np data = {\'animal\': [\'cat\', \'cat\', \'snake\', \'dog\', \'dog\', \'cat\', \'snake\', \'cat\', \'dog\', \'dog\'], \'age\': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3], \'visits\': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], \'priority\': [\'yes\', \'yes\', \'no\', \'yes\', \'no\', \'no\', \'no\', \'yes\', \'no\', \'no\']} labels = [\'a\', \'b\', \'c\', \'d\', \'e\', \'f\', \'g\', \'h\', \'i\', \'j\'] # In[52]: df = pd.DataFrame(data, index=labels,dtype=float) # In[53]: df.info() # In[8]: df.describe() #对整型数据或浮点数进行求均值、四分位数 # In[12]: #Return the first 3 rows of the DataFrame df. df.iloc[:3] #返回三行数据,其余行的数据不返回 # In[13]: df.head(5) #DataFrame由行和列组成,label为行标签 # In[14]: df.loc[:, [\'animal\', \'age\']] # In[20]: df.loc[:,[\'animal\', \'priority\']] #返回animal, priority列的所有数据 # In[21]: #Select the data in rows [3, 4, 8] and in columns [\'animal\', \'age\']. df.loc[df.index[[3, 4, 8]], [\'animal\', \'age\']] #可见loc有时可以起到与iloc同样的作用 # In[23]: #Select only the rows where the number of visits is greater than 3. df[df[\'visits\'] > 3] # In[24]: df[df[\'age\'].isnull()] # In[25]: df[(df[\'animal\'] == \'cat\') & (df[\'age\'] < 3)] # In[26]: df[df[\'age\'].between(2, 4)] # In[27]: df.loc[\'f\', \'age\'] = 1.5 #区分loc, iloc,loc[a, b],a为行属性,b为列属性 # In[28]: df # In[29]: df[\'visits\'].sum() # In[30]: df.groupby(\'animal\')[\'age\'].mean() #以animal分类,求age均值 # In[31]: df.loc[\'k\'] = [5.5, \'dog\', \'no\', 2] #loc增加k行 # In[32]: df # In[33]: df = df.drop(\'k\') #删除k行 # In[34]: df[\'animal\'].value_counts() #以animal分类,暂不明白,value_counts对何计数 # In[35]: df.sort_values(by=[\'age\', \'visits\'], ascending=[False, True]) #按age降序,False降序,True升序,两者冲突时以前者为准 # In[38]: df.sort_values(by = [\'visits\'],ascending = [False]) # In[39]: #The \'priority\' column contains the values \'yes\' and \'no\'. Replace this column with a column of boolean values: \'yes\' should be True and \'no\' should be False. df[\'priority\'] = df[\'priority\'].map({\'yes\': True, \'no\': False}) #map可以快速匹配字典的key,返回value # In[40]: df # In[41]: df[\'animal\'] = df[\'animal\'].replace(\'snake\', \'python\') # In[42]: df # In[58]: df.loc[\'d\', \'age\'] = 1 df.loc[\'h\', \'age\'] =5 # In[59]: df # In[62]: df # In[63]: #For each animal type and each number of visits, find the mean age. In other words, each row is an animal, each column is a number of visits and the values are the mean ages (hint: use a pivot table). df.pivot_table(index= [\'animal\'], columns= [\'visits\'], values=[\'age\'], aggfunc=[\'mean\']) #暂时有问题,因age存在NaN,故先填充数据,此外发现还需要先制定数据类型 # In[67]: df = pd.DataFrame({\'A\': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]}) # In[68]: df # In[69]: #How do you filter out rows which contain the same integer as the row immediately above? df.loc[df[\'A\'].shift() != df[\'A\']] # In[72]: df = pd.DataFrame(np.random.random(size=(5, 3))) # In[73]: df # In[83]: df.sub(df.mean(axis=1), axis =0) #axis=1 对每行数据处理,输出每行,axis=0,对每列数据处理 # In[78]: df.mean(axis = 1) # In[84]: df = pd.DataFrame(np.random.random(size=(5, 10)), columns=list(\'abcdefghij\')) # In[85]: df # In[86]: df.sum().idxmin() # In[87]: #How do you count how many unique rows a DataFrame has (i.e. ignore all rows that are duplicates) len(df.drop_duplicates(keep=False)) # In[108]: df = pd.DataFrame(np.random.random(size=(6,10)), columns =list(\'abcdefghij\') ) # In[109]: df # In[112]: df.loc[4, \'g\'] = np.nan df.loc[1, \'d\'] = np.nan df.loc[5, \'f\'] = np.nan # In[113]: df # In[114]: (df.isnull().cumsum(axis=1) == 3).idxmax(axis=1) # In[115]: df = pd.DataFrame({\'grps\': list(\'aaabbcaabcccbbc\'), \'vals\': [12,345,3,1,45,14,4,52,54,23,235,21,57,3,87]}) # In[116]: df # In[117]: df.groupby(\'grps\')[\'vals\'].nlargest(3).sum(level=0) # In[143]: df = pd.DataFrame({\'A\': range(1,101), \'B\': np.random.randint(1,100)}) # In[144]: df # df.groupby(pd.cut(df[\'A\'], np.arange(0, 101, 10)))[\'B\'].sum() # In[146]: df = pd.DataFrame({\'X\': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]}) # In[147]: df # In[149]: x = (df[\'X\'] != 0).cumsum() y = x != x.shift() df[\'Y\'] = y.groupby((y != y.shift()).cumsum()).cumsum() # df 截止到第29题
3. Pandas Tutorial
4. 函数用法积累
4.1 pd.Categorical
返回字符所属类别的位置索引