Labmanualfds
import numpy as np
a=np.array([[1,2,3], [4,5,6]])
b=np.array([[2,3,4], [3,4,5]])
print("array a:\n",a)
print("array b:\n",b)
import numpy as np
a = np.array([[1,2,3],
[4,5,6]])
b = np.array([[10,11,12],
[13,14,15]])
c=a+b
print(c)
#Matrix Multiplication
#Given 2 numpy arrays as matrices, output the result of multiplying the 2 matrices (as a numpy array)
import numpy as np
a = np.array([[1,2,3],
[4,5,6],
[7,8,9]])
b = np.array([[2,3,4],
[5,6,7],
[8,9,10]])
o = a@b
print(o)
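Note that @ computes the matrix product while * multiplies element-wise; a minimal sketch contrasting the two (the small 2x2 arrays are assumed here for illustration):
import numpy as np
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
print(a @ b)   # matrix product (rows times columns)
print(a * b)   # element-wise product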
#Syntax:array[start:stop:step]
import numpy as np
a = np.arange(10)
b = a[2:7:2]
print(b)
import numpy as np
a = np.arange(10)
b = a[5]
print(b)
import numpy as np
a = np.arange(10)
print(a[2:])
import numpy as np
a = np.arange(10)
print(a[2:5])
a = np.array([[1,2,3],[3,4,5],[4,5,6]])
print(a)
print(a[...,1])
print('\n')
print(a[1,...])
print('\n')
print(a[...,1:])
print(a[1:,...])
#Integer Indexing
import numpy as np
x = np.array([[1, 2], [3, 4], [5, 6]])   # x was missing in the original; a sample array is assumed
# Pick the elements at (0,0), (1,1) and (2,0)
y = x[[0,1,2], [0,1,0]]
print(y)
import numpy as np
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])   # x was missing; a 4x3 array is assumed
print(x)
# Select the four corner elements
rows = np.array([[0,0],[3,3]])
cols = np.array([[0,2],[0,2]])
y = x[rows,cols]
print(y)
import numpy as np
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
print(x)
# slicing
z = x[1:4,1:3]
print(z)
print('\n')
import numpy as np
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
print(x)
# Mix slicing with integer indexing
y = x[1:4,[1,2]]
print(y)
#Expression
import numpy as np
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
print(x)
z1=x[:2, 1:]
#print(z1)
z2=x[2]
#print(z2)
z3=x[2, :]
#print(z3)
z4=x[2: , :]
#print(z4)
z5=x[:, :2]
#print(z5)
z6=x[1, :2]
#print(z6)
z7=x[1:2, :2]
print(z7)
#Multiple Index (transposing axes)
import numpy as np
arr = np.arange(16).reshape((2, 2, 4))   # arr was missing; a 3-D array is assumed
print(arr)
print(arr.T)                       # reverse all axes
print(arr.transpose((1, 0, 2)))    # reorder axes explicitly
#Arithmetic with universal functions
#NumPy performs fast element-wise arithmetic through universal functions (ufunc).
#Power
import numpy as np
a1 = np.array([10, 20, 30, 40, 50, 60])   # base array assumed; the original omitted it
a2 = np.array([3, 5, 6, 8, 2, 33])
arrnew = np.power(a1, a2)   # raise each element of a1 to the corresponding power in a2
print(arrnew)
#Remainder
import numpy as np
a1 = np.array([10, 20, 30, 40, 50, 60])   # dividend array assumed
a2 = np.array([3, 7, 9, 8, 2, 33])
arrnew = np.mod(a1, a2)   # element-wise remainder
print(arrnew)
#Remainder (remainder() behaves like mod())
import numpy as np
a1 = np.array([10, 20, 30, 40, 50, 60])
a2 = np.array([3, 7, 9, 8, 2, 33])
arrnew = np.remainder(a1, a2)
print(arrnew)
#Quotient and Remainder
import numpy as np
a1 = np.array([10, 20, 30, 40, 50, 60])
a2 = np.array([3, 7, 9, 8, 2, 33])
arrnew = np.divmod(a1, a2)   # returns (quotient array, remainder array)
print(arrnew)
#Absolute Values
import numpy as np
a = np.array([-1, -2, 1, 2, 3, -4])   # input array assumed
arr = np.absolute(a)
print(arr)
#Rounding Decimals
#Truncation
#Remove the decimals and return the float number closest to zero, using the trunc() and fix() functions.
import numpy as np
arr = np.trunc([-3.1666, 3.6667])
print(arr)
#fix()
import numpy as np
arr = np.fix([-3.1666, 3.6667])
print(arr)
#Rounding
#The around() function increments the preceding digit or decimal by 1 if the next digit is >= 0.5; otherwise it leaves it unchanged.
import numpy as np
arr = np.around(3.1666, 2)
print(arr)
#Floor
import numpy as np
arr = np.floor([-3.1666, 3.6667])
print(arr)
#Ceil
import numpy as np
arr = np.ceil([-3.1666, 3.6667])
print(arr)
#Trigonometric functions
import numpy as np
angles = np.array([0, np.pi/2, np.pi])   # angles assumed; the original omitted them
sine_of_angles = np.sin(angles)
cosine_of_angles = np.cos(angles)
print(sine_of_angles, cosine_of_angles)
#e. Perform set theory operations such as union, intersection, symmetric difference and fetching unique values
A = {0, 2, 4, 6, 8}
B = {1, 2, 3, 4, 5}
# union
print("Union :", A | B)
# intersection
print("Intersection :", A & B)
# difference
print("Difference :", A - B)
# symmetric difference
print("Symmetric difference :", A ^ B)
#2. Linear Algebra and Random Number generation using linalg and random module in NumPy
#a) Compute dot product, vector product and inner product of two arrays.
# 2.a
import numpy as np
p=np.array([1,2,3])
q=np.array([4,7,8])
print(np.dot(p,q))     # dot product
print(np.vdot(p,q))    # vector dot product (flattens its inputs)
print(np.inner(p,q))   # inner product
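The task's "vector product" usually means the cross product, which the snippet above does not compute; a minimal sketch with np.cross on the same vectors:
import numpy as np
p = np.array([1, 2, 3])
q = np.array([4, 7, 8])
print(np.cross(p, q))   # vector (cross) product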
#b) Perform matrix operations such as multiplication, determinant, sum of diagonal elements and inverse.
import numpy as np
a=np.array([[1,2,3],[4,5,6]])
b=np.array([[2,3,4],[3,4,5]])
print("multiplication:\n", np.dot(a, b.T))   # b is transposed so the shapes align
r=np.array([[1,2,4],[5,4,3],[2,5,4]])
print("determinant:", np.linalg.det(r))
print("trace (sum of diagonal):", np.trace(r))
print("inverse:\n", np.linalg.inv(r))
#c) Compute eigenvalues, eigenvectors and singular value decomposition for a square matrix.
import numpy as np
r=np.array([[1,2,4],[5,4,3],[2,5,4]])
c,d=np.linalg.eig(r)   # eig, not eigh: r is not symmetric
print("eigen values:",c)
print("eigen vectors:",d)
u,s,vt=np.linalg.svd(r)
print("singular values:",s)
#3. a) Generate random samples from uniform, normal, binomial, chi-square and Gaussian distributions using numpy.random functions
import numpy as np
print(np.random.uniform(0.0, 1.0, 5))
print(np.random.normal(0.0, 1.0, 5))    # the normal and Gaussian distributions are the same
print(np.random.binomial(10, 0.5, 5))
print(np.random.chisquare(3, 10))
#3. b) Implement a single random walk with 1000 steps using the random module and extract statistics like the minimum and maximum value along the walk's trajectory.
import random
def random_walk(steps):
    position = 0
    walk = [position]
    for _ in range(steps):
        step = random.choice([-1, 1])   # move one step left or right
        position += step
        walk.append(position)
    return walk
steps = 1000
walk = random_walk(steps)
# Get the minimum and maximum values along the trajectory
min_value = min(walk)
max_value = max(walk)
print("Minimum value:", min_value)
print("Maximum value:", max_value)
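For comparison, a minimal vectorized sketch of the same walk using NumPy (this variant is an addition, not part of the original manual):
import numpy as np
steps = np.random.choice([-1, 1], size=1000)    # 1000 random unit steps
walk = np.concatenate(([0], steps.cumsum()))    # trajectory starting at 0
print("Minimum value:", walk.min())
print("Maximum value:", walk.max())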
4.a. Create DataFrame from List, Dict, List of Dicts, Dict of Series and perform operations such as column selection, addition, deletion and row selection, addition and deletion.
import pandas as pd
# List of lists
data_list = [['Alice', 24], ['Bob', 27], ['Charlie', 22]]
df_list = pd.DataFrame(data_list, columns=['Name', 'Age'])
# Dictionary of lists
data_dict = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [24, 27, 22]}
df_dict = pd.DataFrame(data_dict)
# List of dictionaries
data_list_dicts = [{'Name': 'Alice', 'Age': 24}, {'Name': 'Bob', 'Age': 27}, {'Name': 'Charlie', 'Age': 22}]
df_list_dicts = pd.DataFrame(data_list_dicts)
# Dictionary of Series
data_dict_series = {'Name': pd.Series(['Alice', 'Bob', 'Charlie']), 'Age': pd.Series([24, 27, 22])}
df_dict_series = pd.DataFrame(data_dict_series)
# Column selection, addition and deletion
name_column = df_list['Name']
df_list['City'] = ['Delhi', 'Mumbai', 'Chennai']   # add a column
del df_list['City']                                # delete a column
# Row selection, addition and deletion
second_row = df_dict.iloc[1]
df_dict.loc[len(df_dict)] = ['Dave', 30]           # add a row
df_dict = df_dict.drop(0)                          # delete a row
print(df_list)
print(df_dict)
4.b. Create a DataFrame and perform descriptive statistics functions such as sum, mean, median, mode, standard deviation, skewness, kurtosis, cumulative sum, cumulative product and percent changes.
import pandas as pd
import numpy as np
# Creating a DataFrame with sample data (values assumed; the original omitted them)
data = {
    'A': [10, 20, 30, 40, 50],
    'B': [5, 15, 25, 35, 45]
}
df = pd.DataFrame(data)
print("DataFrame:\n", df)
sum_result = df.sum()
print("\nSum:\n", sum_result)
mean_result = df.mean()
print("\nMean:\n", mean_result)
median_result = df.median()
print("\nMedian:\n", median_result)
mode_result = df.mode()
print("\nMode:\n", mode_result)
std_result = df.std()
print("\nStandard deviation:\n", std_result)
skew_result = df.skew()
print("\nSkewness:\n", skew_result)
kurtosis_result = df.kurt()
print("\nKurtosis:\n", kurtosis_result)
cum_sum_result = df.cumsum()
print("\nCumulative sum:\n", cum_sum_result)
cum_prod_result = df.cumprod()
print("\nCumulative product:\n", cum_prod_result)
pct_change_result = df.pct_change()
print("\nPercent change:\n", pct_change_result)
4.c. Implement the computation of correlation and covariance by considering the DataFrames of stock prices and volumes obtained from Yahoo! Finance using the pandas-datareader package.
# pip install pandas-datareader pandas numpy
import pandas as pd
import numpy as np
import pandas_datareader.data as web
start_date = '2023-01-01'
end_date = '2024-01-01'
# Fetch price/volume data from Yahoo! Finance
aapl_data = web.DataReader('AAPL', 'yahoo', start_date, end_date)
msft_data = web.DataReader('MSFT', 'yahoo', start_date, end_date)
aapl_prices = aapl_data['Adj Close']
msft_prices = msft_data['Adj Close']
aapl_volumes = aapl_data['Volume']
msft_volumes = msft_data['Volume']
combined_df = pd.DataFrame({
    'AAPL_Price': aapl_prices,
    'AAPL_Volume': aapl_volumes,
    'MSFT_Price': msft_prices,
    'MSFT_Volume': msft_volumes
})
# Compute correlation
correlation = combined_df.corr()
# Compute covariance
covariance = combined_df.cov()
print("Correlation matrix:")
print(correlation)
print("\nCovariance matrix:")
print(covariance)
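pandas-datareader's Yahoo! endpoint has frequently broken in recent years; if the calls above fail, here is a sketch using the third-party yfinance package (an assumption, not part of the original manual) to fetch the same fields:
import pandas as pd
import yfinance as yf   # assumed alternative data source: pip install yfinance
aapl = yf.download('AAPL', start='2023-01-01', end='2024-01-01')
msft = yf.download('MSFT', start='2023-01-01', end='2024-01-01')
combined_df = pd.DataFrame({
    'AAPL_Price': aapl['Close'].squeeze(),    # squeeze() handles multi-index columns
    'AAPL_Volume': aapl['Volume'].squeeze(),
    'MSFT_Price': msft['Close'].squeeze(),
    'MSFT_Volume': msft['Volume'].squeeze()
})
print(combined_df.corr())
print(combined_df.cov())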
#Interacting with Web APIs and Databases
#a) Print the last 30 GitHub issues for pandas using requests and the response object's json method. Move the extracted data to a DataFrame and extract fields of interest. (Use url: https://api.github.com/repos/pandas-dev/pandas/issues)
import requests
import pandas as pd
resp = requests.get('https://api.github.com/repos/pandas-dev/pandas/issues')
issues = resp.json()   # list of issue dicts; the API returns 30 per page by default
df = pd.DataFrame(issues, columns=['number', 'title', 'labels', 'state'])   # fields of interest
print(df)
Let's break down how to work with different data formats using `pandas`. We'll cover CSV, JSON, and
Excel file formats.
To read a CSV file from an online source, you can use the `pd.read_csv()` function. For example, let's
use a publicly available dataset:
```python
import pandas as pd
csv_url = 'https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv'
df = pd.read_csv(csv_url)
print(df.head())
```
You can save a DataFrame to a CSV file using the `to_csv()` method:
```python
# Save DataFrame to a CSV file (output filename assumed)
df.to_csv('output.csv', index=False)
```
To read JSON data into a DataFrame, use `pd.read_json()`. You can read JSON from a file or a URL:
```python
json_url = 'https://api.github.com/users/octocat'
# This URL returns a single JSON object, so read it as a Series
df_json = pd.read_json(json_url, typ='series')
print(df_json)
```
You can write a DataFrame to a JSON file using the `to_json()` method:
```python
# Write the DataFrame to a JSON file (output filename assumed)
df.to_json('output.json', orient='records')
```
**Parsing JSON Strings:**
```python
import json
json_string = '''
[{"Name": "Alice", "Age": 24},
 {"Name": "Bob", "Age": 27}]
'''
data = json.loads(json_string)
df_json_from_string = pd.DataFrame(data)
print(df_json_from_string)
```
To read Excel files, use `pd.read_excel()`. You need the `openpyxl` or `xlrd` library to read `.xlsx` or
`.xls` files, respectively:
```python
# URL of an Excel file (example dataset)
excel_url = 'https://people.sc.fsu.edu/~jburkardt/data/xlsx/airtravel.xlsx'
df_excel = pd.read_excel(excel_url)
print(df_excel.head())
```
You can write a DataFrame to an Excel file with the `to_excel()` method (requires `openpyxl`):
```python
# Write DataFrame to an Excel file (output filename assumed)
df_excel.to_excel('output.xlsx', index=False)
```
For more advanced usage, you can write multiple DataFrames to different sheets in the same Excel
file using `ExcelWriter`:
```python
# Write multiple DataFrames to separate sheets (filename and sheet names assumed)
with pd.ExcelWriter('multiple_sheets.xlsx') as writer:
    df.to_excel(writer, sheet_name='CSV_Data', index=False)
    df_excel.to_excel(writer, sheet_name='AirTravel', index=False)
```
Data cleaning and transformation are crucial steps in preparing your dataset for analysis. Here’s how
you can perform these tasks using `pandas` in Python:
We'll start by creating a sample DataFrame and then go through the processes of identifying missing data, handling missing values, and removing duplicates.
```python
import pandas as pd
import numpy as np
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Bob', 'Eve'],
    'Age': [24, 27, np.nan, 27, 22],
    'Score': [85.0, np.nan, 78.0, np.nan, 90.0]   # sample values assumed; the two 'Bob' rows are duplicates
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
```
#### **2. Identifying Missing Data**
```python
missing_data = df.isna()
print("\nMissing Data:")
print(missing_data)
```
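A quick way to summarize missingness (a small addition to the boolean mask above) is to count the missing values per column:
```python
# Count missing values in each column
missing_counts = df.isna().sum()
print(missing_counts)
```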
You can drop rows or columns with missing values using `dropna()`:
```python
df_dropped_rows = df.dropna()
df_dropped_columns = df.dropna(axis=1)
print(df_dropped_rows)
print(df_dropped_columns)
```
**b. Filling Missing Data**
```python
df_filled_value = df.fillna(value=0)
# numeric_only=True keeps the string column from breaking the mean computation
df_filled_mean = df.fillna(df.mean(numeric_only=True))
print(df_filled_value)
print(df_filled_mean)
```
You can also use forward fill (`ffill`) and backward fill (`bfill`) methods:
```python
df_filled_ffill = df.ffill()   # fillna(method='ffill') is deprecated in recent pandas
df_filled_bfill = df.bfill()
print(df_filled_ffill)
print(df_filled_bfill)
```python
duplicates = df.duplicated()
print("\nDuplicate Rows:")
print(duplicates)
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)
```
Data transformation in pandas involves modifying values, renaming columns or indices, and ensuring
that the original dataset remains unchanged when necessary. Here's how you can perform these
transformations:
The `map()` method is useful for element-wise transformations in a DataFrame or Series. For
example, you might want to transform categorical values or apply a function to each element.
```python
import pandas as pd
data = {
    'A': [1, 2, 3],
    'B': ['x', 'y', 'z']
}
df = pd.DataFrame(data)
# map() applies a function element-wise
def transform_value(x):
    return x * 10
df['A_transformed'] = df['A'].map(transform_value)
df['B_transformed'] = df['B'].map(str.upper)
print(df)
```
The `replace()` method is useful for replacing specific values with new values.
```python
df_replaced = df.replace('x', 'alpha')   # replace a specific value with a new one
print(df_replaced)
```
### **2. Creating a Transformed Version Without Modifying the Original Dataset Using `rename`**
The `rename()` method allows you to change column or index names without modifying the original
dataset.
```python
print("\nOriginal DataFrame:")
print(df)
df_renamed = df.rename(columns={'A': 'Alpha', 'B': 'Beta'})   # the original df stays unchanged
print("\nRenamed DataFrame:")
print(df_renamed)
```
Data Preparation
a. Create a DataFrame with normally distributed data using random sampling and detect
possible outliers.
To create a DataFrame with normally distributed data and detect possible outliers, follow these
steps:
We will use `numpy` to generate normally distributed data and `pandas` to create the DataFrame.
```python
import pandas as pd
import numpy as np
np.random.seed(42)
mean = 0
std_dev = 1
# Three columns of normally distributed samples (column names assumed)
data = {
    'A': np.random.normal(mean, std_dev, 1000),
    'B': np.random.normal(mean, std_dev, 1000),
    'C': np.random.normal(mean, std_dev, 1000)
}
# Create DataFrame
df = pd.DataFrame(data)
print(df.head())
```
There are several methods to detect outliers. We'll use the following methods:
- **Z-Score Method**: This method uses the standard deviation and mean to identify outliers.
- **Interquartile Range (IQR) Method**: This method identifies outliers based on the IQR, which is
the range between the first and third quartiles.
The Z-score measures how many standard deviations a data point is from the mean. Typically, data
points with Z-scores greater than 3 or less than -3 are considered outliers.
```python
from scipy import stats
z_scores = np.abs(stats.zscore(df))
outliers_z_score = z_scores > 3   # points more than 3 standard deviations from the mean
outlier_indices = np.where(outliers_z_score.any(axis=1))[0]
print("Outliers detected using Z-score method:")
print(df.iloc[outlier_indices])
```
The IQR method identifies outliers by calculating the range between the first quartile (Q1) and the
third quartile (Q3). Values outside 1.5 times the IQR from Q1 and Q3 are considered outliers.
```python
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
# Calculate IQR
IQR = Q3 - Q1
# Determine outliers
outliers_iqr = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR)))
outlier_indices_iqr = df[outliers_iqr.any(axis=1)].index
print("\nOutliers detected using IQR method:")
print(df.loc[outlier_indices_iqr])
```
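Once flagged, a common follow-up (a small addition, not in the original text) is to drop the outlier rows and keep a cleaned copy:
```python
# Keep only the rows where no column is an IQR outlier
df_clean = df[~outliers_iqr.any(axis=1)]
print(f"Rows before: {len(df)}, after removing outliers: {len(df_clean)}")
```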
b. Perform text manipulation with regular expression by applying relevant regular expression
methods to split a string with a variable number of whitespace characters (tabs, spaces, and
newlines) and get a list of all patterns matching
Text manipulation with regular expressions (regex) in Python is powerful for extracting, splitting, and
finding patterns in strings. To handle splitting a string with variable whitespace characters and
finding all matching patterns, you can use the `re` module.
First, import the `re` module which provides support for regular expressions.
```python
import re
```
Create a string with variable whitespace characters including tabs, spaces, and newlines.
```python
text = """
This is a sample text
characters.
Here\tis\tanother line
"""
```
To split the string by any whitespace character (spaces, tabs, and newlines), use the `re.split()`
method with the regular expression `\s+` which matches one or more whitespace characters.
```python
print("Split text:")
print(split_text)
```
If you want to find all occurrences of a specific pattern in the string, use the `re.findall()` method. For
example, to find all words (sequences of non-whitespace characters), you can use the regular
expression `\S+`.
```python
# Find all runs of non-whitespace characters (i.e., the words)
words = re.findall(r'\S+', text)
print(words)
```
Putting it all together:
```python
import re
text = """
This is a sample text   with variable
numbers of   whitespace
characters.
Here\tis\tanother line
"""
split_text = re.split(r'\s+', text.strip())
words = re.findall(r'\S+', text)
print("Split text:")
print(split_text)
print(words)
```
Let's go through each of the data visualization tasks using Matplotlib and Seaborn. We'll use a
sample online dataset from `seaborn` for demonstration purposes.
To create a line plot with Matplotlib, including setting titles, axis labels, ticks, tick labels, and
annotations, follow these steps:
```python
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('flights')
# Pivot the dataset to get the data in the right format for a line plot
pivot_df = df.pivot(index='month', columns='year', values='passengers')
plt.figure(figsize=(12, 6))
sns.lineplot(data=pivot_df, dashes=False)
plt.title('Monthly Airline Passengers by Year')
plt.xlabel('Month')
plt.ylabel('Number of Passengers')
plt.xticks(rotation=45)
# Annotate a point (example: January 1950, the first month row and second year column)
plt.annotate('1950 Jan', xy=(0, pivot_df.iloc[0, 1]), xytext=(0, 600),
             arrowprops=dict(facecolor='black', shrink=0.05))
plt.savefig('line_plot.png')
plt.show()
```
```python
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({
    'A': [3, 6, 9],
    'B': [4, 7, 1],
    'C': [5, 3, 7]
})
# Grouped bar plot
df.plot(kind='bar')
plt.title('Bar Plot')
plt.xlabel('Index')
plt.ylabel('Values')
plt.savefig('bar_plot.png')
plt.show()
```
```python
# Stacked bar plot of the same DataFrame
df.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Plot')
plt.xlabel('Index')
plt.ylabel('Values')
plt.savefig('stacked_bar_plot.png')
plt.show()
```
```python
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('iris')
# Create a histogram
plt.figure(figsize=(12, 6))
sns.histplot(df['sepal_length'], bins=20)
plt.title('Histogram of Sepal Length')
plt.xlabel('Sepal Length')
plt.ylabel('Frequency')
plt.savefig('histogram.png')
plt.show()
# Create a density plot
plt.figure(figsize=(12, 6))
sns.kdeplot(df['sepal_length'], fill=True)   # 'shade' is deprecated in favor of 'fill'
plt.title('Density Plot of Sepal Length')
plt.xlabel('Sepal Length')
plt.ylabel('Density')
plt.savefig('density_plot.png')
plt.show()
```
```python
df = sns.load_dataset('penguins')
plt.figure(figsize=(12, 6))
# Scatter plot of bill dimensions, colored by species
sns.scatterplot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='species')
plt.savefig('scatter_plot.png')
plt.show()
```
```python
df = sns.load_dataset('tips')
plt.figure(figsize=(12, 6))
# Box plot of total bill by day
sns.boxplot(data=df, x='day', y='total_bill')
plt.xlabel('Day')
plt.ylabel('Total Bill')
plt.savefig('box_plot.png')
plt.show()
```
- **Line Plot**: Use `sns.lineplot()` with titles, labels, ticks and annotations set through Matplotlib.
- **Bar Plots**: Use `df.plot(kind='bar')` for grouped bars and `df.plot(kind='bar', stacked=True)` for stacked bars.
- **Histogram and Density Plot**: Use `sns.histplot()` for histograms and `sns.kdeplot()` for density plots.
- **Scatter Plot**: Use `sns.scatterplot()` to visualize the relationship between two variables.
- **Box Plot**: Use `sns.boxplot()` to visualize data distribution and outliers.
#9. Time Series
a. Create time series using datetime objects indexed by timestamps.
b. Generate a fixed-frequency DatetimeIndex with a specified length using the date_range function.
c. Generate date ranges by setting time zone, localize time zone and convert to a particular time zone using tz_convert, and combine two different time zones.
d. Perform period arithmetic such as adding and subtracting integers from periods, and construct a range of periods using the period_range function.
e. Convert Periods and PeriodIndex objects to another frequency with the asfreq method.
f. Convert Series and DataFrame objects indexed by timestamps to periods with the to_period method.
Here's how you can perform various time series analyses using pandas:
### **a. Create Time Series Using Datetime Object Indexed by Timestamps**
To create a time series with a datetime index, you can use `pd.date_range` and then create a
DataFrame or Series.
```python
import pandas as pd
import numpy as np
# Daily timestamps for one month (range parameters assumed)
date_range = pd.date_range(start='2024-01-01', periods=31, freq='D')
data = np.random.randn(len(date_range))
df = pd.DataFrame({'value': data}, index=date_range)
print(df)
```
You can use `pd.date_range` to generate a `DatetimeIndex` with a specified length and frequency.
```python
print("Generated DatetimeIndex:")
print(datetime_index)
```
You can set, localize, and convert time zones using pandas.
```python
# Create a timezone-naive datetime index
naive_dates = pd.date_range('2024-01-01', periods=5, freq='D')
localized_dates = naive_dates.tz_localize('UTC')                    # attach a time zone
converted_dates = localized_dates.tz_convert('America/New_York')   # convert to another zone
print(localized_dates)
print(converted_dates)
df_timezone = pd.DataFrame({
'UTC': localized_dates,
'New_York': converted_dates
})
print(df_timezone)
```
Period arithmetic can be performed by adding or subtracting periods, and you can use
`pd.period_range` to create ranges of periods.
```python
# Create a period range
period_range = pd.period_range(start='2024-01', periods=6, freq='M')
print("Period Range:")
print(period_range)
# Add and subtract integers from a period
p = pd.Period('2024-01', freq='M')
print("\nPeriod Arithmetic:")
print(p + 3)   # Period('2024-04', 'M')
print(p - 2)   # Period('2023-11', 'M')
```
You can convert periods and `PeriodIndex` objects to another frequency using the `asfreq` method.
```python
# Create a PeriodIndex
period_index = pd.period_range(start='2024-01', periods=6, freq='M')
# The task calls for asfreq; it maps each monthly period to its quarter
quarterly_index = period_index.asfreq('Q')
print("\nQuarterly PeriodIndex:")
print(quarterly_index)
```
### **f. Convert Series and DataFrame Indexed by Timestamps to Periods**
```python
date_range = pd.date_range('2024-01-01', periods=6, freq='D')
timestamp_df = pd.DataFrame({'value': np.random.randn(6)}, index=date_range)
period_df = timestamp_df.to_period('M')   # timestamp index -> monthly PeriodIndex
print(timestamp_df)
print(period_df)
```
Resampling, downsampling, and upsampling can be done using the `resample` method.
```python
date_range = pd.date_range('2024-01-01', periods=365, freq='D')   # one year of daily data
data = np.random.randn(len(date_range))
ts = pd.Series(data, index=date_range)
monthly_ts = ts.resample('M').mean()
# Upsample to hourly frequency
hourly_ts = ts.resample('H').asfreq()
print(ts.head())
print(monthly_ts.head())
print(hourly_ts.head())
```
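The `asfreq()` upsample above leaves NaNs at the new hourly timestamps; here is a short sketch of common fill strategies (an addition for completeness):
```python
# Fill the gaps created by upsampling
hourly_ffill = ts.resample('H').ffill()          # repeat the last daily value
hourly_interp = ts.resample('H').interpolate()   # linear interpolation between days
print(hourly_ffill.head())
print(hourly_interp.head())
```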
- **Time Series Creation**: Use `pd.date_range` to create a time series with a datetime index.
- **Time Zones**: Localize, convert, and combine time zones using `tz_localize` and `tz_convert`.
- **Period Arithmetic**: Add or subtract periods and create period ranges with `pd.period_range`.
- **Resampling**: Use `resample` for downsampling and upsampling time series data.
Let's dive into data aggregation with pandas. We’ll work with creating a DataFrame, grouping by
single and multiple keys, computing summary statistics, and performing exploratory data analysis.
### **a. Create a Tabular Dataset and Split Data into Groups**
**Creating a DataFrame:**
```python
import pandas as pd
import numpy as np
data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B', 'C', 'C', 'A', 'B'],
    'Subcategory': ['X', 'Y', 'X', 'Y', 'Z', 'X', 'Y', 'Z', 'Z', 'X'],
    'Value': [10, 20, 15, 25, 30, 35, 40, 45, 50, 55]   # numeric column assumed for aggregation
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
```
```python
grouped_single = df.groupby('Category')
print("\nGrouped by 'Category':")
print(f"\nGroup '{name}':")
print(group)
```
**Group by Multiple Keys:**
```python
print(f"\nGroup {name}:")
print(group)
```
**Compute Aggregates:**
```python
# Compute summary statistics (sum, mean, std) for the grouped data
aggregated_data = grouped_single['Value'].agg(['sum', 'mean', 'std'])
print(aggregated_data)
```
We'll use an online dataset for this example. Let's use the `seaborn` library's built-in dataset "tips".
```python
import seaborn as sns
df_tips = sns.load_dataset('tips')
print(df_tips.head())
```
```python
grouped_day = df_tips.groupby('day').agg({
    'total_bill': ['sum', 'mean'],
    'tip': ['sum', 'mean']
})
print("\nAggregated Data by 'day':")
print(grouped_day)
```
```python
grouped_day_time = df_tips.groupby(['day', 'time']).agg({
    'total_bill': 'mean',
    'tip': 'mean',
    'size': 'mean'
})
print("\nAggregated Data by 'day' and 'time':")
print(grouped_day_time)
```
```python
import matplotlib.pyplot as plt
basic_stats = df_tips.describe(include='all')
print(basic_stats)
# Visualize the distribution of total bills
plt.figure(figsize=(10, 6))
sns.histplot(df_tips['total_bill'], kde=True)
plt.title('Distribution of Total Bill')
plt.show()
```
- **Grouping**: Use `groupby` with a single key or a list of keys to split the data, then `agg` to compute summary statistics per group.