Introduction to pandas (6)

Date: 2021-02-20
# Load the baseball game-log dataset and report its true memory footprint.
# memory_usage='deep' makes pandas measure the string contents of object
# columns instead of just the pointer size.
import pandas as pd
gl=pd.read_csv('./pandas/data/game_logs.csv')
#Memory usage of data
gl.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 859.4 MB


# Average per-column memory for each dtype group (deep, so strings are
# fully counted).
# NOTE: .mean() averages across the columns (plus the index entry), so the
# figure printed is per-column usage, not the group total.
for kind in ('float64', 'object', 'int64'):
    cols = gl.select_dtypes(include=[kind])
    avg_bytes = cols.memory_usage(deep=True).mean()
    avg_mb = avg_bytes / 1024 / 1024
    print('[%s] memory usage %0.2f MB' % (kind, avg_mb))
[float64] memory usage 1.29 MB
[object] memory usage 9.50 MB
[int64] memory usage 1.12 MB


# Value ranges representable by uint8 / int8 / int16 / int32 / int64.
import numpy as np
for int_dtype in ('uint8', 'int8', 'int16', 'int32', 'int64'):
    print(np.iinfo(int_dtype))
Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------



#The converted data takes up memory
def mem_usage(data):
    if isinstance(data,pd.DataFrame):
        mem_b=data.memory_usage(deep=True).sum()
    else:
        mem_b=data.memory_usage(deep=True)
    return "{:03.2f} MB".format(mem_b/1024**2)

gl_int64 = gl.select_dtypes(include=['int64'])

#Down type conversion
# Bug fix: the original called gl_int.apply(...), but `gl_int` was never
# defined — the frame selected above is `gl_int64`. downcast='unsigned'
# picks the smallest unsigned dtype that can hold each column (presumably
# all values are non-negative; the pasted output 7.87 MB -> 1.48 MB
# suggests the downcast succeeded).
gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(gl_int64))
print(mem_usage(gl_int32))

#Float64 to float
# Downcast float64 columns to the smallest float subtype (float32 here).
gl_float64 = gl.select_dtypes(include=['float64'])
gl_float = gl_float64.apply(pd.to_numeric, downcast='float')

# Fixed: the two original lines were garbled and would not parse
# ('Print ("before conversion): + mem_ usage(gl_ float64))').
print("before conversion: " + mem_usage(gl_float64))
print("after conversion: " + mem_usage(gl_float))
7.87 MB
1.48 MB
Before conversion: 100.99 MB
After conversion: 50.49 MB
# Splice the downcast numeric columns back into a copy of the full frame.
opt_gl = gl.copy()
opt_gl[gl_int32.columns] = gl_int32
opt_gl[gl_float.columns] = gl_float
# Fixed: the two original 'Print (...)' lines were garbled and would not
# parse; rewritten as valid print calls.
print("original data size: " + mem_usage(gl))
print("converted data size: " + mem_usage(opt_gl))
Original data size: 859.43 MB
Converted data size: 802.54 MB
# Copy the object (string) columns so later edits don't touch `gl`;
# describe() on object columns summarizes count/unique/top/freq.
gl_obj=gl.select_dtypes(include=['object']).copy()
print(gl_obj.describe())
       day_of_week  v_name v_league  h_name h_league day_night  \
count       171907  171907   171907  171907   171907    140150   
unique           7     148        7     148        7         2   
top            Sat     CHN       NL     CHN       NL         D   
freq         28891    8870    88866    9024    88867     82724   

                   completion forefeit protest park_id  ... h_player_6_id  \
count                     116      145     180  171907  ...        140838   
unique                    116        3       5     245  ...          4774   
top     19590602,PIT06,2,1,39        H       V   STL07  ...      grimc101   
freq                        1       69      90    7022  ...           427   

       h_player_6_name h_player_7_id h_player_7_name h_player_8_id  \
count           140838        140838          140838        140838   
unique            4720          5253            5197          4760   
top      Charlie Grimm      grimc101   Charlie Grimm      lopea102   
freq               427           491             491           676   

       h_player_8_name h_player_9_id h_player_9_name additional_info  \
count           140838        140838          140838            1456   
unique            4710          5193            5142             332   
top           Al Lopez      spahw101    Warren Spahn            HTBF   
freq               676           339             339            1112   

       acquisition_info  
count            140841  
unique                1  
top                   Y  
freq             140841  

[4 rows x 78 columns]


# Demonstrate the category dtype on day_of_week: only 7 distinct values,
# so integer category codes are far smaller than repeated strings.
dow = gl_obj.day_of_week
print(dow.head())
dow_cat = dow.astype('category')
print(dow_cat.head())
# Fixed: the two original lines were garbled ('Print ("before conversion"
# + mem_ usage(dow))') and would not parse.
print("before conversion: " + mem_usage(dow))
print("after conversion: " + mem_usage(dow_cat))
#The data with more repetition is converted into category to reduce the data memory
# Convert an object column to 'category' only when fewer than half of its
# values are distinct; otherwise the per-row category codes plus the
# categories table would not save memory.
convert_obj = pd.DataFrame()
for col in gl_obj.columns:
    num_unique = len(gl_obj[col].unique())
    num_total = len(gl_obj[col])
    if num_unique / num_total < 0.5:
        convert_obj.loc[:, col] = gl_obj[col].astype('category')
    else:
        convert_obj.loc[:, col] = gl_obj[col]

# Fixed: the two original summary prints were garbled ('Print ('before
# data conversion: '+ mem_ usage(gl_ obj))') and would not parse.
print('before data conversion: ' + mem_usage(gl_obj))
print('after data conversion: ' + mem_usage(convert_obj))
opt_gl[convert_obj.columns] = convert_obj
print(mem_usage(opt_gl))
#Apply operation
# Load the Titanic training set for the DataFrame.apply examples below.
titanic=pd.read_csv('./pandas/data/titanic_train.csv')
titanic.iloc[99]
# Get the single row at positional index 99 (not 99 rows).
def get_row(data, index=99):
    """Return the element of *data* at positional *index*.

    Generalized: the original hard-coded index 99; it is now a parameter
    whose default preserves the old behavior for existing callers.
    """
    return data.iloc[index]
# Column-wise apply: yields each column's value at positional index 99.
row=titanic.apply(get_row)
row
#Count the number of Nan in each column
def get_null_count(data):
    """Return how many entries of *data* are missing (NaN/None)."""
    missing_mask = pd.isnull(data)
    missing_values = data[missing_mask]
    return len(missing_values)
# Count missing values per column of the Titanic frame.
null_count=titanic.apply(get_null_count)
print(null_count)
#Data conversion
def which_class(row):
    """Translate a row's numeric 'Pclass' (1/2/3) into an English word.

    NaN yields "Unknown". Fixes the original's typos: "UnKown" -> "Unknown"
    and "Tow" -> "Two". Any other class code falls through and returns
    None implicitly, same as before.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "One"
    elif pclass == 2:
        return "Two"
    elif pclass == 3:
        return "Three"
# Row-wise apply (axis=1): map each passenger's Pclass to a word.
classes=titanic.apply(which_class,axis=1)
print(classes)
#Find out the underage data
def is_minor(row):
    """Return True when the passenger's 'Age' is strictly under 18.

    NaN ages compare False against 18, so passengers with unknown age are
    reported as adults — identical to the original if/else version, but
    without the redundant `return True / return False` branches.
    """
    return bool(row['Age'] < 18)

# Boolean mask of under-18 passengers, used to filter the frame.
minor=titanic.apply(is_minor,axis=1)
print(titanic[minor])

Introduction to pandas (6)