Pandas
Frequently used code for pandas related code snippets
plot columns
df.plot(figsize=(20,10))
fill na
dt.fillna(0)
Reading data from Postgresql
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine("postgresql://user@localhost:5432/mydb")
df = pd.read_sql_query("select * from table", con=engine)
Add rows
dt.append({"col1":[1,2,3], "coln":[4,5,6]})
pivoting dataframe
df.pivot(columns="col_with_columns",values="col_with_values")
getting index of max/min values
df['column'].idxmax()
Count instances of values in column
df['target'].value_counts().sort_values(ascending=False)
Filtering by column's value len
df[df['itemsets'].map(len) > 2]
Iterate by rows
for index, row in df.iterrows():
print(row['c1'], row['c2'])
Subsetting columns
df[['col1', 'col2']]
or by regex pattern
df.filter(regex=("d.*"))
Parse datetimes
# errors = ['coerce','raise','ignore'] # default: 'raise'
df['date'] = pd.to_datetime(df.date, format='%Y-%m-%d')
Group by dates
df[df.date.dt.day == 8] #(or hour or minute)
Plot evolution over time
%matplotlib inline
import matplotlib.pyplot as plt
fig=plt.figure(figsize=(18, 13), dpi= 80, facecolor='w', edgecolor='k')
ax = plt.gca()
hist_dt[hist_dt["target_community"]==0].groupby([hist_dt.created_at.dt.day,hist_dt.created_at.dt.hour]).size().plot(ax=ax)
# if multiple samples: hist_dt[hist_dt["target_community"]==1].groupby([hist_dt.created_at.dt.day,hist_dt.created_at.dt.hour]).size().plot(ax=ax)
# if not the subsetting part could be removed
plt.xlabel('Dia,Hora de publicación')
plt.ylabel('Cantidad de tweets')
New column A based on column B
df['Discounted_Price'] = df.apply(lambda row: row.Cost -
(row.Cost * 0.1), axis = 1)
#if row.property == 1 else None # for conditional setting
To select rows whose column value equals a scalar, some_value, use ==:
df.loc[df['column_name'] == some_value]
To select rows whose column value is in an iterable, some_values, use isin:
df.loc[df['column_name'].isin(some_values)]
To select rows whose column value is in another column array
df.apply(lambda x: x['Responsibility Type'] in x['Roles'], axis=1)
Combine multiple conditions with &:
df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]
Note the parentheses. Due to Python's operator precedence rules, & binds more tightly than <= and >=. Thus, the parentheses in the last example are necessary. Without the parentheses
df['column_name'] >= A & df['column_name'] <= B
is parsed as
df['column_name'] >= (A & df['column_name']) <= B
To select rows whose column value does not equal some_value, use !=:
df.loc[df['column_name'] != some_value]
To select rows whose value is not in some_values
df.loc[~df['column_name'].isin(some_values)]
Improve query efficiency by setting an index
df = df.set_index(['colname'])
Order columns lexicographically
df = df.reindex(sorted(df.columns), axis=1)
Remove duplicates (distinct)
df.drop_duplicates()# subset=["col1", "col2"] use only those columns for distinction
Check if a field exists
df[df['column'].isnull()]
#or
df[df['column'].notnull()]
Replacing column values
w['female'] = w['female'].map({'female': 1, 'male': 0})
#or
dt.loc[dt['ccLeadtime'] == -1, 'ccLeadtime'] = 0
One-hot encoding
dfDummies = pd.get_dummies(df['categorical'], prefix = 'category')
df = pd.concat([df, dfDummies], axis=1)
Rename column
df = df.rename(columns={"A": "a", "B": "c"})
Rename index names
df.index.names = ['new_name']
Remove column from list
df.drop(['pop'], axis=1)
Join dataframes
pd.merge(df1,df2,on='key')
Iterate by groups
grouped = df.groupby('A')
for name, group in grouped:
Group data by year
data.groupby(data.date.dt.year)
Count unique by group
df.groupby('location')['user'].nunique()
Collect list in Group by
df.groupby('a')['b'].apply(list).reset_index(name='new')
Get normalized values from groupby count
df2 = df.groupby(['subset_product', 'subset_close']).size().reset_index(name='prod_count')
a = df2.groupby('subset_product')['prod_count'].transform('sum')
df2['prod_count'] = df2['prod_count'].div(a)
Generate csv from dataframe
df.to_csv('filename.csv',index=False)
Sum to date column
df['date'] = df['created'] + pd.to_timedelta(7, unit='d')
Replace character in column
df["column"] = df["column"].str.replace('foo', 'bar')
References
Last updated
Was this helpful?