pandas常用指令
导入pandas
import pandas as pd import modin.pandas as pd
将字典转为pd.Series
d={‘x’:100,“y”:301,“z”:302} s1=pd.Series(d)
利用pd.Series创建pd.DataFrame
l1=[1,2,3] l2=[‘x’,‘y’,‘z’] l3=[1,2,4] l4=[‘x’,‘y’,‘z’] l5=[1,2,6] l6=[‘x’,‘y’,‘z’] s1=pd.Series(l1,index=l2,name=“a”) s2=pd.Series(l3,index=l4,name=“b”) s3=pd.Series(l5,index=l6,name=“c”) #Series作为行 df=pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3}) #Series作为列 df=pd.DataFrame([s1,s2,s3])
读取excel,设定第一行所在的行,第一列所在的列,索引列,字符类型
books=pd.read_excel(“Books1.xlsx”,header=3,usecols=‘C:F’,index_col=None,dtype={“ID”:str,“InStore”:str,“Date”:str})
读取txt,csv等,设定分隔符
students3 = pd.read_csv(‘C:/Temp/Students.txt’, sep='|', index_col=‘ID’)
导出excel,设定索引行
books.set_index(“ID”,inplace=True) books.to_excel(“output.xlsx”)
列间计算
books[“Price”]=books[“ListPrice”]*books[“Discount”]
Series应用函数计算
def add_2(x): return x+2 books[“ListPrice”]=books[“ListPrice”].apply(add_2) #或者 books[“ListPrice”]=books[“ListPrice”].apply(lambda x:x+2)
Series转换数据类型
table.Score=table.Score.astype(int)
DataFrame排序
products.sort_values(by=[“Worthy”,“Price”],inplace=True,ascending=[True,False])
DataFrame筛选
students=students.loc[students.Age.apply(lambda a:18<=a<30)].loc[students.Score.apply(lambda a:85<=a<100)]
DataFrame列合并,追加,删除,更改列名,删除与填充空值
students=pd.read_excel(“Student_Score.xlsx”,sheet_name=‘Students’,index_col=‘ID’) scores=pd.read_excel(“Student_Score.xlsx”,sheet_name=‘Scores’,index_col=“ID”) table=students.merge(scores,how=‘left’,on=‘ID’).fillna(0)
# 追加列 students[‘Age’] = 25
删除列
students.drop(columns=[‘Score’, ‘Age’], inplace=True)
插入列
students.insert(1, column=‘Foo’, value=np.repeat(‘foo’, len(students)))
改列名
students.rename(columns={‘Foo’: ‘FOO’, ‘Name’: ‘NAME’}, inplace=True)
设置空值
students[‘ID’] = students[‘ID’].astype(float) for i in range(5, 15): students[‘ID’].at[i] = np.nan
去掉空值
students.dropna(inplace=True)
print(students)
DataFrame行合并,追加,新建,插入,更改,删除与填充空值
import pandas as pd
page_001 = pd.read_excel(‘C:/Temp/Students.xlsx’, sheet_name=‘Page_001’) page_002 = pd.read_excel(‘C:/Temp/Students.xlsx’, sheet_name=‘Page_002’)
追加合并
students = page_001.append(page_002).reset_index(drop=True)
追加新建
stu = pd.Series({‘ID’: 41, ‘Name’: ‘Abel’, ‘Score’: 90}) students = students.append(stu, ignore_index=True)
删除(可切片)
students = students.drop(index=[39, 40])
插入
stu = pd.Series({‘ID’: 100, ‘Name’: ‘Bailey’, ‘Score’: 100}) part1 = students[:21] # .iloc[] is the same part2 = students[21:] students = part1.append(stu, ignore_index=True).append(part2).reset_index(drop=True)
更改
stu = pd.Series({‘ID’: 101, ‘Name’: ‘Danni’, ‘Score’: 101}) students.iloc[39] = stu
设置空值
for i in range(5, 15): students[‘Name’].at[i] = ''
去掉空值
missing = students.loc[students[‘Name’] == ‘'] students.drop(missing.index, inplace=True)
print(students)
DataFrame条件格式,使用style.applymap
import pandas as pd
def low_score_red(s): color = ‘red’ if s<60 else ‘green’ return f’color:{color}’
students = pd.read_excel(‘c:/Temp/Students.xlsx’) students.style.applymap(low_score_red, subset=[‘Test_1’, ‘Test_2’, ‘Test_3’])
def highest_score_green2(col): return [‘background-color:lime’ if v==col.max() else ‘background-color:red’ for v in col] students.style.apply(highest_score_green2, subset=[‘Test_1’, ‘Test_2’, ‘Test_3’])
import seaborn as sns
color_map = sns.light_palette(‘green’, as_cmap=True)
students = pd.read_excel(‘c:/Temp/Students.xlsx’) students.style.background_gradient(cmap=color_map, subset=[‘Test_1’,‘Test_2’,‘Test_3’])
DataFrame填加一列进行复杂函数运算
import pandas as pd import numpy as np
def get_circumcircle_area(l, h): r = np.sqrt(l ** 2 + h ** 2) / 2 return r ** 2 * np.pi
def wrapper(row): return get_circumcircle_area(row[‘Length’], row[‘Height’])
rects = pd.read_excel(‘C:/Temp/Rectangles.xlsx’, index_col=‘ID’) rects[‘Circumcircle Area’] = rects.apply(wrapper, axis=1) print(rects)
DataFrame数据透视
import pandas as pd from datetime import date
orders = pd.read_excel(‘C:/Temp/Orders.xlsx’, dtype={‘Date’: date}) orders[‘Year’] = pd.DatetimeIndex(orders.Date).year groups = orders.groupby([‘Category’, ‘Year’]) s = groups[‘Total’].sum() c = groups[‘ID’].count() pt1 = pd.DataFrame({‘Sum’: s, ‘Count’: c}) pt2 = orders.pivot_table(index=‘Category’, columns=‘Year’, values=‘Total’, aggfunc=np.sum)
print(pt1) print(pt2)
pandas连接数据库
import pyodbc import sqlalchemy import pandas as pd
connection = pyodbc.connect(‘DRIVER={SQL Server}; SERVER=(local); DATABASE=AdventureWorks;USER=sa;PASSWORD=123456’) engine = sqlalchemy.create_engine(‘mssql+pyodbc://sa:123456@(local)/AdventureWorks?driver=SQL+Server’)
query = ‘SELECT FirstName, LastName FROM Person.Person’ df1 = pd.read_sql_query(query, connection) df2 = pd.read_sql_query(query, engine)
pd.options.display.max_columns = 999 print(df1.head()) print(df2.head())
pandas时间索引与时间相关操作
#创建时间索引,bdate_range是工作日 import numpy as np rng = pd.date_range(‘2012-05-01 00:00:00’, periods=3, freq=‘2M’) rng1 = pd.bdate_range(‘2012-05-01 00:00:00’, periods=3, freq=‘2M’) print(rng) ts = pd.Series(np.random.randn(len(rng)), index=rng) print(ts.head()) #更改时间索引间隔 converted = ts.asfreq(‘2M’, method=‘pad’) print(converted.head()) #更改时间的索引类型为Series类型 print(pd.to_datetime(pd.Series(rng))) #时间索引中元素类型为Timestamp print(type(rng[0])) rng[0]
DatetimeIndex([‘2012-05-31’, ‘2012-07-31’, ‘2012-09-30’], dtype=‘datetime64[ns]’, freq=‘2M’) 2012-05-31 0.212964 2012-07-31 0.507259 2012-09-30 0.075358 Freq: 2M, dtype: float64 2012-05-31 0.212964 2012-07-31 0.507259 2012-09-30 0.075358 Freq: 2M, dtype: float64 0 2012-05-31 1 2012-07-31 2 2012-09-30 dtype: datetime64[ns] <class ‘pandas._libs.tslibs.timestamps.Timestamp’> Timestamp(‘2012-05-31 00:00:00’, freq=‘2M’)
pandas时间索引值中的频率
rng = pd.date_range(start, end, freq=‘BM’) ''' Bbusiness day frequency Ccustom business day frequency Dcalendar day frequency Wweekly frequency Mmonth end frequency SMsemi-month end frequency (15th and end of month) BMbusiness month end frequency CBMcustom business month end frequency MSmonth start frequency SMSsemi-month start frequency (1st and 15th) BMSbusiness month start frequency CBMScustom business month start frequency Qquarter end frequency BQbusiness quarter end frequency QSquarter start frequency BQSbusiness quarter start frequency A, Yyear end frequency BA, BYbusiness year end frequency AS, YSyear start frequency BAS, BYSbusiness year start frequency BHbusiness hour frequency Hhourly frequency T, minminutely frequency Ssecondly frequency L, msmilliseconds U, usmicroseconds Nnanoseconds '''