Pandas Session 6 Code

Pandas Session 6 Code For Video Click Fahad Hussain CS


Hi, welcome to the Fahad Hussain CS blog. Do not forget to subscribe and follow the blog and the YouTube channel Fahad Hussain CS.

Code:

"""
## Working with Delimited Formats


# Data Cleaning and Preparation


## Handling Missing Data

"""


import pandas as pd

import numpy as np

# A small Series of strings with one missing entry (NaN) at position 2.
string_data = pd.Series(['Fahad', 'Hussain', np.nan, 'CS'])
print(string_data)

# isna() and isnull() are aliases: True marks a missing value.
print(string_data.isna())

# Python's None counts as missing too, so label 0 is flagged afterwards.
string_data.loc[0] = None
print(string_data.isna())


"""### Filtering Out Missing Data"""


from numpy import nan as NA

# Numeric Series with two gaps (NA is numpy's nan, imported above).
data = pd.Series([1, NA, 3.5, NA, 7])

# dropna() returns a new Series holding only the non-missing values.
print(data.dropna())

# The same selection spelled as a boolean mask; the result is only
# displayed when run as a notebook cell.
data.loc[data.notna()]


data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],

                     [NA, NA, NA], [NA, 6.5, 3.]])

cleaned = data.dropna()

print(data)

print(cleaned)


data.dropna(how='all')


data[1]


data[1] = NA

print(data)

print(data.dropna(axis=1, how='all'))


# 7x3 frame of standard-normal draws (unseeded, so values differ per run).
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

# Knock out the first four rows of column 1 and the first two of column 2.
df.iloc[0:4, 1] = NA
df.iloc[0:2, 2] = NA
print(df)

# Rows with any NA are dropped: only rows 4-6 survive.
print(df.dropna())

# thresh=2 keeps rows that have at least two non-NA values.
print(df.dropna(thresh=2))


"""### Filling In Missing Data"""


# Fill every missing value with a constant; returns a new frame.
df.fillna(value=0)

# A dict gives a different fill value per column label.
df.fillna(value={1: 0.5, 2: 0})

# Same constant fill, but modifying df itself; the return value is bound
# to `_` so nothing is echoed when run as a notebook cell.
_ = df.fillna(value=0, inplace=True)

df


# 6x3 frame of random values with gaps at the bottom of columns 1 and 2.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

# Forward fill: propagate the last valid observation downwards.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

# limit=2 fills at most two consecutive missing values per gap.
df.ffill(limit=2)


# Series with two gaps.
data = pd.Series([1., NA, 3.5, NA, 7])
print(data)

# Fill the gaps with the median of the non-missing values.
data.fillna(value=data.median())


"""## Data Transformation


### Removing Duplicates

"""


# Seven rows; the last row repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

# Boolean Series: True where a row repeats an earlier row.
data.duplicated()

# Frame with those repeated rows removed.
data.drop_duplicates()

# Add a column of distinct values so that no full row is a duplicate.
data['v1'] = range(7)
print(data)

# Restrict the duplicate check to column k1 only.
data.drop_duplicates(subset=['k1'])

print(data)

# Check on (k1, k2) and keep the last occurrence instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')


"""### Transforming Data Using a Function or Mapping"""


# Food names in mixed case, with a weight in ounces for each entry.
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

# Lookup table keyed by the lower-case food name.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Normalise the case first so 'Pastrami' and 'Bacon' match the dict keys.
lowercased = data['food'].str.lower()
print(lowercased)

# Series.map translates each value through the dict.
# (The original called the non-existent `.pma`, which raises AttributeError.)
data['animal'] = lowercased.map(meat_to_animal)
print(data)

# One-step alternative: lower-case and look up inside a function.
data['food'].map(lambda x: meat_to_animal[x.lower()])


"""### Replacing Values"""


# -999 and -1000 act as placeholder values in this Series.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

# Single value -> NaN.
data.replace(to_replace=-999, value=np.nan)

# Several values -> the same replacement.
data.replace(to_replace=[-999, -1000], value=np.nan)

# Several values -> a matching list of replacements.
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])

# The same pairing expressed as a dict.
data.replace(to_replace={-999: np.nan, -1000: 0})


"""### Renaming Axis Indexes"""


# 3x4 frame holding 0..11 with string row and column labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data


def transform(x):
    """Return the first four characters of *x*, upper-cased."""
    return x[:4].upper()


# Index.map returns a new Index; data is untouched by this call.
data.index.map(transform)

# Assigning the mapped Index back changes the row labels on data itself.
data.index = data.index.map(transform)
data

# rename() with functions: returns a relabelled copy.
data.rename(index=str.title, columns=str.upper)

# rename() with dicts relabels only the listed entries.
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

# inplace=True applies the relabelling to data itself.
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data


# Discretization: sort a list of ages into labelled intervals.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Bin edges; by default each interval is open on the left, closed on the right.
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

# Integer bin position for every age, and the interval labels themselves.
cats.codes
cats.categories

# Count per bin, sorted by count. The top-level pd.value_counts() is
# deprecated, so the Categorical is wrapped in a Series first.
pd.Series(cats).value_counts()

# right=False closes the intervals on the left instead.
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

# Custom names for the bins in place of the interval notation.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)


# Passing an integer to cut() asks for that many equal-width bins;
# precision=2 limits the interval labels to two decimal digits.
data = np.random.rand(20)
pd.cut(data, 4, precision=2)

data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats

# Quantile bins hold equally many points. The top-level pd.value_counts()
# is deprecated, so the Categorical is wrapped in a Series first.
pd.Series(cats).value_counts()

# Explicit quantile edges (fractions between 0 and 1) instead of a count.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])


"""##Dummy Variables"""


import numpy as np

import pandas as pd


# A categorical 'key' column next to a numeric 'data1' column.
df = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
print(df)

# One indicator column per distinct key value.
print(pd.get_dummies(df['key']))

# prefix='key' names the indicator columns key_a, key_b, key_c.
dummies = pd.get_dummies(df['key'], prefix='key')

# Put the numeric column and the indicator columns side by side.
df_with_dummy = df.loc[:, ['data1']].join(dummies)
df_with_dummy


from google.colab import files


# Ask Colab for file uploads; the result is iterated below as a mapping
# from file name to the uploaded content.
uploaded = files.upload()

# Report the name and size of each uploaded file.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))


# Column names for the movies file, which has no header row.
mnames = ['movie_id', 'title', 'genres']

# Fields are separated by the two-character token '::'. Multi-character
# separators are only supported by the Python parser engine; naming it
# explicitly avoids the ParserWarning and fallback of the default C engine.
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames,
                       engine='python')

# First ten rows.
movies[:10]


# Each genres entry is a '|'-separated string; collect every genre
# mentioned across all movies (with repeats).
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))

# Distinct genres. Going through a Series avoids the deprecated call of
# pd.unique() on a plain Python list.
genres = pd.Series(all_genres).unique()

genres

# Start from an all-zero matrix: one row per movie, one column per genre.
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)

dummies

# Worked example for the first movie: split its genres and look up the
# column positions of those genres in dummies.
gen = movies.genres[0]
gen.split('|')
dummies.columns.get_indexer(gen.split('|'))

# Set the matching genre columns to 1 for every movie, row by row.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

# Attach the indicator columns to the movies frame with a 'Genre_' prefix.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]


# Bin edges splitting [0, 1] into five equal-width intervals.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Fixed seed so the ten uniform draws are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
values

# Bin the draws, then expand the bins into one indicator column each.
binned = pd.cut(values, bins)
pd.get_dummies(binned)


No comments:

Post a Comment

Feel free to write your query in the comments. Your comments are always welcome.