1.5
1.5
print(df)
Pandas DataFrame Using Python Dictionary
# Season results for three clubs over 2010-2012, stored column-wise:
# every list has nine entries and position i in each list describes the
# same (year, team) record.
data = {
    'year': [2010, 2011, 2012, 2010, 2011, 2012, 2010, 2011, 2012],
    'team': [
        'FCBarcelona', 'FCBarcelona', 'FCBarcelona',
        # No stray whitespace inside the labels: 'RMadrid ' and 'RMadrid'
        # would otherwise count as two different teams.
        'RMadrid', 'RMadrid', 'RMadrid',
        'ValenciaCF', 'ValenciaCF', 'ValenciaCF',
    ],
    'wins': [30, 28, 32, 29, 32, 26, 21, 17, 19],
    'draws': [6, 7, 4, 5, 4, 7, 8, 10, 8],
    'losses': [2, 3, 2, 4, 2, 5, 9, 11, 11],
}
# `columns` states the column order of the resulting frame explicitly.
football = pd.DataFrame(
    data, columns=['year', 'team', 'wins', 'draws', 'losses']
)
# Start from an empty frame (no rows, no columns).
df = pd.DataFrame()
# Load a frame from a CSV file in the current working directory.
df = pd.read_csv('data.csv')
# Load from a sub-directory; header=0 takes the column names from the
# first line of the file.
df = pd.read_csv('./csv_files/data.csv', header=0)
# Selecting with a list of labels returns a DataFrame holding just the
# 'Name' and 'City' columns.
wanted_columns = ['Name', 'City']
name_city = df[wanted_columns]
# Build a frame from a dict whose values have different kinds: scalars
# (A, B, F) are repeated for every row, while the Series / array /
# Categorical values (C, D, E) each supply four entries.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20250128"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
>>> df2
A B C D E F
0 1.0 2025-01-28 1.0 3 test foo
1 1.0 2025-01-28 1.0 3 train foo
2 1.0 2025-01-28 1.0 3 test foo
3 1.0 2025-01-28 1.0 3 train foo
>>> df2.dtypes
A float64
B datetime64[s]
C float32
D int32
E category
F object
dtype: object
>>> dates = pd.date_range("20250101", periods=6)
>>> df = pd.DataFrame(np.random.randn(6, 4),
...                   index=dates, columns=list("ABCD"))
>>> df
A B C D
2025-01-01 0.293879 0.324915 0.434401 -1.391992
2025-01-02 -0.701108 -0.011810 0.835216 -0.586246
2025-01-03 -0.677587 0.348766 -0.457098 1.147319
2025-01-04 -1.671191 0.651669 -0.685242 -1.954809
2025-01-05 0.526734 -1.297472 0.177927 0.612196
2025-01-06 0.778206 0.865262 -0.970947 -0.460400
>>> df.head()
A B C D
2025-01-01 0.293879 0.324915 0.434401 -1.391992
2025-01-02 -0.701108 -0.011810 0.835216 -0.586246
2025-01-03 -0.677587 0.348766 -0.457098 1.147319
2025-01-04 -1.671191 0.651669 -0.685242 -1.954809
2025-01-05 0.526734 -1.297472 0.177927 0.612196
>>> df.tail(2)
A B C D
2025-01-05 0.526734 -1.297472 0.177927 0.612196
2025-01-06 0.778206 0.865262 -0.970947 -0.460400
>>> df.index
DatetimeIndex(['2025-01-01', '2025-01-02',
'2025-01-03', '2025-01-04','2025-01-05',
'2025-01-06'],dtype='datetime64[ns]', freq='D')
>>> df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
>>> df.to_numpy()
array([[ 0.29387942, 0.32491506, 0.43440078,-1.39199244],
[-0.70110762,-0.01181039, 0.83521647, -0.58624567],
[-0.67758743, 0.34876597, -0.45709763, 1.14731948],
[-1.67119052, 0.65166926, -0.68524221, -1.95480876],
[ 0.52673407,-1.29747191, 0.17792695, 0.6121957 ],
[ 0.77820621,0.8652619 , -0.97094701, -0.46040001]])
>>> df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.241844 0.146888 -0.110957 -0.438989
std 0.934028 0.768723 0.702184 1.170421
min -1.671191 -1.297472 -0.970947 -1.954809
25% -0.695228 0.072371 -0.628206 -1.190556
50% -0.191854 0.336841 -0.139585 -0.523323
75% 0.468520 0.575943 0.370282 0.344047
max 0.778206 0.865262 0.835216 1.147319
>>> df.T
2025-01-01 2025-01-02 2025-01-03 2025-01-04 2025-01-05 2025-01-06
A 0.293879 -0.701108 -0.677587 -1.671191 0.526734 0.778206
B 0.324915 -0.011810 0.348766 0.651669 -1.297472 0.865262
C 0.434401 0.835216 -0.457098 -0.685242 0.177927 -0.970947
D -1.391992 -0.586246 1.147319 -1.954809 0.612196 -0.460400
>>> df["A"]
2025-01-01 0.293879
2025-01-02 -0.701108
2025-01-03 -0.677587
2025-01-04 -1.671191
2025-01-05 0.526734
2025-01-06 0.778206
Freq: D, Name: A, dtype: float64
>>> df.A
2025-01-01 0.293879
2025-01-02 -0.701108
2025-01-03 -0.677587
2025-01-04 -1.671191
2025-01-05 0.526734
2025-01-06 0.778206
Freq: D, Name: A, dtype: float64
# Column-oriented records: each key is a column label, each list holds
# that column's values.
data = dict(
    Name=['John', 'Alice', 'Bob'],
    Age=[25, 30, 35],
    City=['New York', 'London', 'Paris'],
)
# Build the frame from the mapping of column label -> values.
df = pd.DataFrame(data)
# Persist the frame as CSV; index=False leaves the row labels out of
# the file.
df.to_csv(path_or_buf='output.csv', index=False)
df = pd.DataFrame(data)
# Boolean mask: True for every row whose ('Name', 'Age') pair already
# appeared in an earlier row. The mask is only evaluated here, not stored.
df.duplicated(subset=['Name', 'Age'])
# Remove rows that repeat an earlier row in every column, modifying df
# in place.
df.drop_duplicates(inplace=True)
import pandas as pd
# Sample frame with four rows; the name 'Tom' occurs twice, with
# different ages and cities.
data = dict(
    Name=['Tom', 'Nick', 'John', 'Tom'],
    Age=[20, 21, 19, 18],
    City=['New York', 'London', 'Paris', 'Berlin'],
)
df = pd.DataFrame(data)
# Sample frame with missing values (each None is stored as NaN), so that
# dropna() has rows to remove. This is the frame the "Cleaned Data"
# output shown for this example corresponds to.
data = {
    'A': [1, 2, 3, None, 5],
    'B': [None, 2, 3, 4, 5],
    'C': [1, 2, None, None, 5],
}
df = pd.DataFrame(data)
print("Original Data:\n", df)
# dropna() returns a new frame keeping only the rows without any missing
# value; df itself is left unchanged.
df_cleaned = df.dropna()
print("Cleaned Data:\n", df_cleaned)
Cleaned Data:
A B C
1 2.0 2.0 2.0
4 5.0 5.0 5.0
import pandas as pd
# Three numeric columns; each None is stored as NaN in the frame.
data = {
    'A': [1, 2, 3, None, 5],
    'B': [None, 2, 3, 4, 5],
    'C': [1, 2, None, None, 5],
}
df = pd.DataFrame(data)
print("Original Data:\n", df)
# Substitute 0 for every NaN, modifying df in place.
df.fillna(value=0, inplace=True)
print("\nData after filling NaN with 0:\n", df)
import pandas as pd
# Five sample records; only the row labelled 3 ('Alex') has Gender 'F'.
data = dict(
    Name=['John', 'Michael', 'Tom', 'Alex', 'Ryan'],
    Age=[8, 9, 7, 80, 100],
    Gender=['M', 'M', 'M', 'F', 'M'],
    Standard=[3, 4, 12, 3, 5],
)
df = pd.DataFrame(data)
# Overwrite the single cell at row label 3, column 'Gender' ('F' -> 'M').
df.at[3, 'Gender'] = 'M'
print(df)
import pandas as pd
data = {
    'Name': ['John', 'Michael', 'Tom', 'Alex', 'Ryan'],
    'Age': [8, 9, 7, 80, 100],
    'Gender': ['M', 'M', 'M', 'M', 'M'],
    'Standard': [3, 4, 12, 3, 5],
}
df = pd.DataFrame(data)
# Replace values based on a condition: every age that is greater than 14
# and an exact multiple of 10 is divided by 10.
# A boolean mask selects the affected rows in one step instead of looping
# over the index row by row.
needs_scaling = (df['Age'] > 14) & (df['Age'] % 10 == 0)
# Floor division keeps the results as integers, matching the column's
# integer values; it is exact here because the mask only admits
# multiples of 10.
df.loc[needs_scaling, 'Age'] = df.loc[needs_scaling, 'Age'] // 10
print(df)
Resources: Datasets
39