import numpy as np
import pandas as pd
np.random.seed(1) # set random seed to get the same random series
Data cleansing is a fundamental step in data processing. Since you have a lot of data, you are likely to have duplicates, rows or columns to remove, missing values (NaN), and erroneous values. Once the problematic cases are found, the question arises of how to correct them. Cleaning everything up can take a long time.
Pandas offers the drop_duplicates method to remove duplicates.
df1 = df1.drop_duplicates()                           # returns a new DataFrame without duplicates
df1.drop_duplicates(inplace=True)                     # modifies df1 in place
df1.drop_duplicates(subset=["A", "C"], inplace=True)  # looks for duplicates in columns A and C only
Be careful: like many other methods, this one does not modify its DataFrame by default but returns a new one; assign the result or pass inplace=True. The subset argument lets you choose in which columns to look for duplicates.
np.random.seed(0)
df = pd.DataFrame({'foo': np.random.randint(3,size=7), 'bar': np.random.randint(3,size=7), 'baz': np.random.randint(3,size=7)})
df.iloc[1] = df.iloc[0]  # make row 1 a duplicate of row 0
df
| | foo | bar | baz |
|---|---|---|---|
| 0 | 0 | 2 | 2 |
| 1 | 0 | 2 | 2 |
| 2 | 0 | 0 | 1 |
| 3 | 1 | 0 | 1 |
| 4 | 1 | 2 | 1 |
| 5 | 2 | 1 | 1 |
| 6 | 0 | 2 | 0 |
df.drop_duplicates(subset=["foo", "baz"], inplace=True)
df
| | foo | bar | baz |
|---|---|---|---|
| 0 | 0 | 2 | 2 |
| 2 | 0 | 0 | 1 |
| 3 | 1 | 0 | 1 |
| 5 | 2 | 1 | 1 |
| 6 | 0 | 2 | 0 |
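Note that by default it is the first occurrence of each duplicate that is kept. The keep argument of drop_duplicates controls this behaviour; a minimal sketch, not applied to the DataFrame above:

df.drop_duplicates(keep='last')   # keep the last occurrence of each duplicated row
df.drop_duplicates(keep=False)    # remove all occurrences, including the first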
If you want to work on the duplicates themselves, there is the duplicated method to find out which rows are duplicates:
df = pd.DataFrame({'foo': np.random.randint(5,size=5), 'bar': np.random.randint(5,size=5)})
df.iloc[1] = df.iloc[0]  # make row 1 a duplicate of row 0
df.duplicated()
0    False
1     True
2    False
3    False
4    False
dtype: bool
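Since duplicated returns a boolean Series, it can be used as a filter to inspect the duplicated rows before deciding what to do with them; a minimal sketch:

print(df.duplicated().sum())  # number of rows flagged as duplicates
df[df.duplicated()]           # extract those rows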
To remove rows or columns, the command is [drop](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html). As with the previous method, assign the result to the DataFrame or use inplace=True to store the result.
df.drop(columns='foo')  # remove the column foo
| | bar |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 3 |
| 3 | 0 |
| 4 | 1 |
df.drop(index=[2,3]) # 2 & 3 are labels
| | foo | bar |
|---|---|---|
| 0 | 1 | 0 |
| 1 | 1 | 0 |
| 4 | 3 | 1 |
You can also specify the rows to remove as the result of a logical filter:
to_be_dropped = df[df.foo % 2 == 0].index
print("To be dropped:", to_be_dropped)
df.drop(index = to_be_dropped)
To be dropped: Index([3], dtype='int64')
| | foo | bar |
|---|---|---|
| 0 | 1 | 0 |
| 1 | 1 | 0 |
| 2 | 3 | 3 |
| 4 | 3 | 1 |
A NaN (Not a Number) is the result of a computation error, or of missing data if the loading method has chosen this way to represent it (see http://pandas.pydata.org/pandas-docs/stable/missing_data.html). For example:
df = df.drop(index=1)
df.at[0,'bar'] = 0
df.at[0,'foo'] = 0    # 0/0 will give a NaN in the division below
df.at[1,'foo'] = 3    # re-creates row 1, so df.at[1,'bar'] is NaN since row 1 has just been removed
df.at[2,'bar'] = None # None is stored as NaN
df['div'] = df.bar / df.foo
df
| | foo | bar | div |
|---|---|---|---|
| 0 | 0.0 | 0.0 | NaN |
| 2 | 3.0 | NaN | NaN |
| 3 | 0.0 | 0.0 | NaN |
| 4 | 3.0 | 1.0 | 0.333333 |
| 1 | 3.0 | NaN | NaN |
We can locate the NaNs with the following logical filter:
df.isna()
| | foo | bar | div |
|---|---|---|---|
| 0 | False | False | True |
| 2 | False | True | True |
| 3 | False | False | True |
| 4 | False | False | False |
| 1 | False | True | True |
and use it to replace the NaNs, for example with df.loc[df.bar.isna(), 'bar'] = 7, or directly with fillna:
df.bar = df.bar.fillna(7)
df
| | foo | bar | div |
|---|---|---|---|
| 0 | 0.0 | 0.0 | NaN |
| 2 | 3.0 | 7.0 | NaN |
| 3 | 0.0 | 0.0 | NaN |
| 4 | 3.0 | 1.0 | 0.333333 |
| 1 | 3.0 | 7.0 | NaN |
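fillna can also take a different value for each column through a dictionary, or a value computed from the data; a minimal sketch with arbitrary fill values:

df.fillna({'bar': 7, 'div': 0})      # a fill value per column
df['div'].fillna(df['div'].mean())   # fill with a statistic of the column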
Sometimes we prefer to remove the rows containing NaNs:
df.dropna(inplace=True)
df
| | foo | bar | div |
|---|---|---|---|
| 4 | 3.0 | 1.0 | 0.333333 |
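dropna has arguments to control what exactly is removed; a minimal sketch of the most useful ones (applied before the dropna above, they would keep more rows):

df.dropna(how='all')        # drop only the rows where all values are NaN
df.dropna(subset=['bar'])   # only consider NaNs in the column bar
df.dropna(axis=1)           # drop the columns containing NaN instead of the rows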
It is possible to estimate missing or false data. If the data is ordered, we can even interpolate to fill holes or to detect inconsistent values.
dates = pd.date_range('2016-08-01', periods=8, freq='D')
temperature = pd.DataFrame({'temp': [21.5, 24, 25.5, None, 25.2, None, None, 20.1]}, index=dates)
temperature.drop(temperature.index[2], inplace=True) # so index is not linear anymore
temperature
| | temp |
|---|---|
| 2016-08-01 | 21.5 |
| 2016-08-02 | 24.0 |
| 2016-08-04 | NaN |
| 2016-08-05 | 25.2 |
| 2016-08-06 | NaN |
| 2016-08-07 | NaN |
| 2016-08-08 | 20.1 |
You can simply indicate that a missing value copies the previous value. If several NaNs follow each other, you must specify how many of them should take the last non-NaN value. This is done with limit, which indicates for how many consecutive NaNs the operation is applied.
temperature.ffill(limit=1) # forward fill (backward is bfill)
| | temp |
|---|---|
| 2016-08-01 | 21.5 |
| 2016-08-02 | 24.0 |
| 2016-08-04 | 24.0 |
| 2016-08-05 | 25.2 |
| 2016-08-06 | 25.2 |
| 2016-08-07 | NaN |
| 2016-08-08 | 20.1 |
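The symmetric method is bfill, which propagates the next valid value backwards; a minimal sketch on the same data:

temperature.bfill(limit=1)  # 2016-08-04 gets 25.2 and 2016-08-07 gets 20.1; 2016-08-06 stays NaN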
We can also interpolate. Be careful: the only method that takes the dates of the index into account is time.
temperature.interpolate(method='linear')
| | temp |
|---|---|
| 2016-08-01 | 21.5 |
| 2016-08-02 | 24.0 |
| 2016-08-04 | 24.6 |
| 2016-08-05 | 25.2 |
| 2016-08-06 | 23.5 |
| 2016-08-07 | 21.8 |
| 2016-08-08 | 20.1 |
temperature.interpolate(method='time').iloc[2]
temp    24.8
Name: 2016-08-04 00:00:00, dtype: float64
The possible methods are described in SciPy.
method: {'linear', 'time', 'index', 'values', 'nearest', 'zero',
'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'polynomial', 'spline',
'piecewise_polynomial', 'from_derivatives', 'pchip', 'akima'}
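To see the difference between linear, which treats the values as equally spaced, and the methods that use the index values, here is a small sketch on a numeric, irregular index:

s = pd.Series([0.0, None, None, 9.0], index=[0, 1, 2, 9])
s.interpolate(method='linear')   # gives 3.0 and 6.0, evenly spaced between the known points
s.interpolate(method='index')    # gives 1.0 and 2.0, proportional to the index values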
Sometimes we want to replace values other than NaN. This can be done with a filter, but replace is faster for simple cases.
df2 = pd.DataFrame({'foo': np.random.randint(10,size=5000), 'bar': np.random.randint(10,size=5000)})
%timeit df2.replace([1,2],[11,12]) # replace 1 and 2 by 11 and 12 respectively
%timeit df2.replace([3,4],134) # replace 3 and 4 by 134
%timeit df2[df2==5] = 105
%timeit df2[(df2==6) | (df2==7)] = 167
132 μs ± 1.89 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
133 μs ± 2.68 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
226 μs ± 1.41 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
342 μs ± 9.21 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
replace can also use regular expressions.
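A minimal sketch with an arbitrary pattern, assuming string values:

s = pd.Series(['item_01', 'item_02', 'other'])
s.replace(r'^item_\d+$', 'item', regex=True)  # 'item_01' and 'item_02' become 'item'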