FDS Lab Manual

1.a). Perform addition, subtraction, multiplication and division on NumPy arrays

#Addition, Subtraction, Multiplication and Division

import numpy as np

a=np.array([[1,2,3], [4,5,6]])

b=np.array([[2,3,4], [3,4,5]])

print("array a:\n",a)

print("array b:\n",b)

print("addition of two arrays:\n",np.add(a,b))

print("subtraction of two arrays:\n",np.subtract(a,b))

print("multiplication of two arrays:\n",np.multiply(a,b))

print("division of two arrays:\n",np.divide(a,b))

#Element-wise addition of 2 numpy arrays

import numpy as np

a = np.array([[1,2,3],

[4,5,6]])

b = np.array([[10,11,12],

[13,14,15]])

c=a+b

print(c)

#Matrix Multiplication

#Given 2 numpy arrays as matrices, output the result of multiplying the 2 matrices (as a numpy array)

import numpy as np

a = np.array([[1,2,3],

[4,5,6],

[7,8,9]])

b = np.array([[2,3,4],

[5,6,7],
[8,9,10]])

o = a@b

print(o)

1.b). Perform slicing and indexing on multi-dimensional arrays

#Slicing Range of Items

#Syntax:array[start:stop:step]

import numpy as np

a = np.arange(10)

b = a[2:7:2]

print(b)

#slice single item

import numpy as np

a = np.arange(10)

b = a[5]

print(b)

#slice items starting from index

import numpy as np

a = np.arange(10)

print(a[2:])

#slice items between indexes

import numpy as np

a = np.arange(10)

print(a[2:5])

#2D array Slicing

# 2D array Row and Column slicing


import numpy as np

a = np.array([[1,2,3],[3,4,5],[4,5,6]])

print('Our array is:\n')

print(a)

# this returns array of items in the second column

print('The items in the second column are:')

print(a[...,1])

print('\n')

# Now we will slice all items from the second row

print('The items in the second row are:')

print(a[1,...])

print('\n')

# Now we will slice all items from column 1 onwards

print('The items from column 1 onwards are:')

print(a[...,1:])

print('The items from row 1 onwards are:')

print(a[1:,...])

#Integer Indexing

import numpy as np

x = np.array([[1, 2], [3, 4], [5, 6]])

y = x[[0,1,2], [0,1,0]]

print(y)

#corner elements from 2D array

import numpy as np

x = np.array([[ 0, 1, 2],[ 3, 4, 5],[ 6, 7, 8],[ 9, 10, 11]])

print('Our array is:\n')

print(x)
rows = np.array([[0,0],[3,3]])

cols = np.array([[0,2],[0,2]])

y = x[rows,cols]

print('The corner elements of this array are:\n')

print(y)

#Input for Slicing

import numpy as np

x = np.array([[ 0, 1, 2],[ 3, 4, 5],[ 6, 7, 8],[ 9, 10, 11]])

print('Our array is:\n')

print(x)

# slicing

z = x[1:4,1:3]

print('After slicing, our array becomes:')

print(z)

print('\n')

# using advanced index for column

import numpy as np

x = np.array([[ 0, 1, 2],[ 3, 4, 5],[ 6, 7, 8],[ 9, 10, 11]])

print('Our array is:\n')

print(x)

y = x[1:4,[1,2]]

print('Slicing using advanced index for column:\n')

print(y)

#Expression

import numpy as np

x = np.array([[ 0, 1, 2],[ 3, 4, 5],[ 6, 7, 8],[ 9, 10, 11]])


print('Our array is:\n')

print(x)

z1=x[:2, 1:]

#print(z1)

z2=x[2]

#print(z2)

z3=x[2, :]

#print(z3)

z4=x[2: , :]

#print(z4)

z5=x[:, :2]

#print(z5)

z6=x[1, :2]

#print(z6)

z7=x[1:2, :2]

print(z7)

#Multiple Index

import numpy as np

arr = np.arange(32).reshape((8, 4))

print(arr)

print(arr[[1, 5, 7, 2], [0, 3, 1, 2]])

# Transposing Arrays and Swapping Axes

import numpy as np

arr = np.arange(15).reshape((3, 5))

print(arr)

print(arr.T)

arr = np.arange(16).reshape((2, 2, 4))

print(arr)
print(arr.transpose((1, 0, 2)))

#Boolean Array Indexing

import numpy as np

x = np.array([[ 0, 1, 2],[ 3, 4, 5],[ 6, 7, 8],[ 9, 10, 11]])

print('Our array is:')

print(x)

print('\n')

# Now we will print the items greater than 5

print('The items greater than 5 are:')

print(x[x > 5])

1.c). Perform computations on multi-dimensional arrays using universal functions (ufunc).

#Power

import numpy as np

a1 = np.array([10, 20, 30, 40, 50, 60])

a2 = np.array([3, 5, 6, 8, 2, 33])

arrnew = np.power(a1, a2)  # note: 60**33 overflows 64-bit integers, so the last result wraps around

print(arrnew)

#Remainder

import numpy as np

a1 = np.array([10, 20, 30, 40, 50, 60])

a2 = np.array([3, 7, 9, 8, 2, 33])

arrnew = np.mod(a1, a2)

print(arrnew)

#Remainder
import numpy as np

a1 = np.array([10, 20, 30, 40, 50, 60])

a2 = np.array([3, 7, 9, 8, 2, 33])

arrnew = np.remainder(a1, a2)

print(arrnew)

#Quotient and Mod

import numpy as np

a1 = np.array([10, 20, 30, 40, 50, 60])

a2 = np.array([3, 7, 9, 8, 2, 33])

arrnew = np.divmod(a1, a2)

print(arrnew)

#Absolute Values

import numpy as np

a = np.array([-1, -2, 1, 2, 3, -4])

arr = np.absolute(a)

print(arr)

#Rounding Decimals

#Truncation

#Remove the decimals, and return the float number closest to zero. Use the trunc() and fix() functions.

import numpy as np

arr = np.trunc([-3.1666, 3.6667])

print(arr)

#fix()

import numpy as np

arr = np.fix([-3.1666, 3.6667])

print(arr)
#Rounding

#The around() function rounds to the given number of decimals: the last kept digit is incremented by 1 if the next digit is >= 5, otherwise it is left unchanged.

import numpy as np

arr = np.around(3.1666, 2)

print(arr)

#Floor

#The floor() function rounds each element down to the nearest integer.

import numpy as np

arr = np.floor([-3.1666, 3.6667])

print(arr)

#Ceil

#The ceil() function rounds each element up to the nearest integer.

import numpy as np

arr = np.ceil([-3.1666, 3.6667])

print(arr)

#Sine of an Angle (in radians)

#Calculate the sine of an array of angles (in radians) using NumPy

import numpy as np

angles = np.array([3.14, 3.14/2, 6.28])

sine_of_angles = np.sin(angles)

print('Sine of the given array of angles = ', sine_of_angles)

angles = np.array([3.14, 3.14/2, 6.28])

cosine_of_angles = np.cos(angles)

print('Cosine of the given array of angles = ', cosine_of_angles)


1.d). Compute arithmetic mean, standard deviation, variance, percentile, minimum and maximum, cumulative sum and product using statistical functions in NumPy.

import numpy as np

a=np.array([[1,2,3], [4,5,6]])

print("mean of array a:",np.mean(a))

print("standard devition of a:",np.std(a))

print("variance of array a:",np.var(a))

print("percentile of array a:",np.percentile(a,50))

print("minimum of array a:",np.min(a))

print("maximum of array a:",np.max(a))

print("cumulative sum of array a:",np.cumsum(a))

print("cumulative product of array a:",np.cumprod(a))

1.e). Perform set theory operations such as union, intersection, symmetric difference and fetching unique values.

# Program to perform different set operations

# define two sets

A = {0, 2, 4, 6, 8}

B = {1, 2, 3, 4, 5}

# union

print("Union :", A | B)

# intersection

print("Intersection :", A & B)

# difference

print("Difference :", A - B)

# symmetric difference
print("Symmetric difference :", A ^ B)

#2. Linear Algebra and Random Number generation using linalg and random modules in NumPy

2.a). Compute dot product, vector product and inner product of two arrays.

import numpy as np

p=np.array([1,2,3])

q=np.array([4,7,8])

print(np.dot(p,q))

print(np.vdot(p,q))

print(np.inner(p,q))
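
Note that for 1-D arrays np.vdot is the same as np.dot; the vector (cross) product named in the task is a separate operation. A short sketch with np.cross, reusing p and q from above:

```python
import numpy as np

p = np.array([1, 2, 3])
q = np.array([4, 7, 8])

# vector (cross) product: a vector perpendicular to both p and q
print(np.cross(p, q))  # [-5  4 -1]
```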

2.b). Perform matrix operations such as multiplication, determinant, sum of diagonal elements and inverse.

import numpy as np

a=np.array([[1,2,3],[4,5,6]])

b=np.array([[2,3,4],[3,4,5]])

print("multiplication of two arrays:",np.multiply(a,b))

import numpy as np

r=np.array([[1,2,4], [5,4,3],[2,5,4]])

print("determinent of array r:",np.linalg.det(r))

print("sum of diagonal elements:",np.trace(a))

print("inverse of array a:",np.linalg.inv(r))

2.c). Compute eigenvalues, eigenvectors and singular value decomposition for a square matrix.

import numpy as np

r=np.array([[1,2,4],[5,4,3],[2,5,4]])

c,d=np.linalg.eig(r)  # eig rather than eigh, since r is not a symmetric matrix

print("eigen values:",c)
print("eigen vector:",d)

print("singular value decomposition:",np.linalg.svd(r))

3.a). Generate random samples from uniform, normal, binomial, chi-square and Gaussian distributions using numpy.random functions.

import numpy as np

print(np.random.uniform(low = 2, high = 10, size = (3)))

print(np.random.uniform(low = 2, high = 10, size = (2,4)))

print(np.random.binomial(n=52, p=0.7, size = (2)))

print(np.random.binomial(n=52, p=0.7, size = (2,3)))

print(np.random.normal(0.0, 1.0, 5))  # the normal distribution is the Gaussian distribution

print(np.random.chisquare(3, 10))
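
For reproducible samples, the same draws can be made through NumPy's newer Generator API (a hedged sketch; the seed value 42 is arbitrary):

```python
import numpy as np

rng = np.random.default_rng(42)  # seeded generator, so results repeat across runs

print(rng.uniform(2, 10, size=3))
print(rng.normal(0.0, 1.0, size=5))
print(rng.binomial(n=52, p=0.7, size=2))
print(rng.chisquare(3, size=10))
```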

3.b). Implement a single random walk with 1000 steps using the random module and extract statistics like minimum and maximum value along the walk's trajectory.

import random

# Function to perform a single random walk with n steps

def random_walk(steps):

    position = 0

    walk = [position]

    for _ in range(steps):

        step = random.choice([-1, 1])  # Step is either -1 or 1

        position += step

        walk.append(position)

    return walk

# Set number of steps

steps = 1000

# Perform the random walk

walk = random_walk(steps)
# Get the minimum and maximum values along the trajectory

min_value = min(walk)

max_value = max(walk)

# Output the results

print(f"Minimum value: {min_value}")

print(f"Maximum value: {max_value}")

4. Data Manipulation using pandas

4.a). Create DataFrame from List, Dict, List of Dicts, Dict of Series and perform operations such as column selection, addition, deletion and row selection, addition and deletion.

import pandas as pd

# List of lists

data_list = [['Alice', 24], ['Bob', 27], ['Charlie', 22]]

df_list = pd.DataFrame(data_list, columns=['Name', 'Age'])

print("DataFrame from List:\n", df_list)

# Dictionary of lists

data_dict = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [24, 27, 22]}

df_dict = pd.DataFrame(data_dict)

print("\nDataFrame from Dictionary:\n", df_dict)

# List of dictionaries

data_list_dicts = [{'Name': 'Alice', 'Age': 24}, {'Name': 'Bob', 'Age': 27}, {'Name': 'Charlie', 'Age': 22}]

df_list_dicts = pd.DataFrame(data_list_dicts)

print("\nDataFrame from List of Dictionaries:\n", df_list_dicts)


# Dictionary of Series

data_dict_series = {'Name': pd.Series(['Alice', 'Bob', 'Charlie']), 'Age': pd.Series([24, 27, 22])}

df_dict_series = pd.DataFrame(data_dict_series)

print("\nDataFrame from Dictionary of Series:\n", df_dict_series)

# Selecting the 'Name' column

name_column = df_list['Name']

print("\nSelected 'Name' Column:\n", name_column)

# Adding a new column 'Score'

df_list['Score'] = [85, 90, 78]

print("\nDataFrame after adding 'Score' column:\n", df_list)

# Deleting the 'Age' column

df_list = df_list.drop('Age', axis=1)

print("\nDataFrame after deleting 'Age' column:\n", df_list)

# Selecting the second row (index 1)

second_row = df_dict.iloc[1]

print("\nSelected Second Row:\n", second_row)

4.b). Create a DataFrame and perform descriptive statistics functions such as sum, mean, median, mode, standard deviation, skewness, kurtosis, cumulative sum, cumulative product and percent changes.

import pandas as pd

import numpy as np
# Creating a DataFrame with sample data

data = {

'A': [10, 15, 20, 25, 30],

'B': [1, 2, 3, 4, 5],

'C': [100, 150, 200, 250, 300]

}

df = pd.DataFrame(data)

print("DataFrame:\n", df)

# Sum of all columns

sum_result = df.sum()

print("\nSum:\n", sum_result)

# Mean of all columns

mean_result = df.mean()

print("\nMean:\n", mean_result)

# Median of all columns

median_result = df.median()

print("\nMedian:\n", median_result)

# Mode of all columns

mode_result = df.mode()

print("\nMode:\n", mode_result)

# Standard Deviation of all columns


std_result = df.std()

print("\nStandard Deviation:\n", std_result)

# Skewness of all columns

skew_result = df.skew()

print("\nSkewness:\n", skew_result)

# Kurtosis of all columns

kurtosis_result = df.kurt()

print("\nKurtosis:\n", kurtosis_result)

# Cumulative Sum of all columns

cum_sum_result = df.cumsum()

print("\nCumulative Sum:\n", cum_sum_result)

# Cumulative Product of all columns

cum_prod_result = df.cumprod()

print("\nCumulative Product:\n", cum_prod_result)

# Percent Change of all columns

pct_change_result = df.pct_change()

print("\nPercent Change:\n", pct_change_result)

4.c). Implement the computation of correlation and covariance by considering the DataFrames of stock prices and volumes obtained from Yahoo! Finance using the pandas-datareader package.

pip install pandas-datareader pandas numpy

import pandas as pd

import pandas_datareader.data as web

import numpy as np

# Define the stock symbols and the time period

symbols = ['AAPL', 'MSFT']

start_date = '2023-01-01'

end_date = '2024-01-01'

# Fetch stock price and volume data

def fetch_data(symbol, start, end):

    return web.DataReader(symbol, 'yahoo', start, end)

aapl_data = fetch_data('AAPL', start_date, end_date)

msft_data = fetch_data('MSFT', start_date, end_date)

aapl_prices = aapl_data['Adj Close']

aapl_volumes = aapl_data['Volume']

msft_prices = msft_data['Adj Close']

msft_volumes = msft_data['Volume']

combined_df = pd.DataFrame({

'AAPL_Price': aapl_prices,

'AAPL_Volume': aapl_volumes,

'MSFT_Price': msft_prices,

'MSFT_Volume': msft_volumes

}).dropna() # Drop rows with any missing values


# Compute correlation

correlation = combined_df.corr()

# Compute covariance

covariance = combined_df.cov()

print("Correlation matrix:")

print(correlation)

print("\nCovariance matrix:")

print(covariance)

#Interacting with Web APIs and Databases

a) Fetch the last 30 GitHub issues for pandas using the requests library and the response object's json method. Move the extracted data to a DataFrame and extract fields of interest. (Use url: https://api.github.com/repos/pandas-dev/pandas/issues)

import requests

import pandas as pd

resp=requests.get('https://reqres.in/api/users')

resp_dict=resp.json() #print(resp_dict)

df=pd.DataFrame(resp_dict.get('data'))

print(df)
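
The snippet above demonstrates the pattern against the reqres.in test API; a hedged sketch of the GitHub variant named in the task (the field names follow the GitHub REST API, and unauthenticated requests are rate-limited):

```python
import requests
import pandas as pd

url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url, params={'per_page': 30})  # last 30 issues
issues = resp.json()

# keep only the fields of interest
df = pd.DataFrame(issues, columns=['number', 'title', 'state', 'created_at'])
print(df)
```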


Let's break down how to work with different data formats using `pandas`. We'll cover CSV, JSON, and
Excel file formats.

### a. Reading and Writing Data in Text Format (CSV)

**Reading CSV Files:**

To read a CSV file from an online source, you can use the `pd.read_csv()` function. For example, let's
use a publicly available dataset:

```python

import pandas as pd

# URL of a CSV file (example dataset)

csv_url = 'https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv'

# Read the CSV file into a DataFrame

df = pd.read_csv(csv_url)

# Display the first few rows of the DataFrame

print(df.head())

```

**Writing CSV Files:**

You can save a DataFrame to a CSV file using the `to_csv()` method:

```python
# Save DataFrame to a CSV file

df.to_csv('output_file.csv', index=False) # index=False avoids writing row indices to the file

```

### b. Reading, Writing, and Parsing Data in JSON Format

**Reading JSON Files:**

To read JSON data into a DataFrame, use `pd.read_json()`. You can read JSON from a file or a URL:

```python

# URL of a JSON file (example dataset)

json_url = 'https://api.github.com/users/octocat'

# Read the JSON into a pandas object (this endpoint returns a single JSON object, so read it as a Series)

df_json = pd.read_json(json_url, typ='series')

# Display the DataFrame

print(df_json)

```

**Writing JSON Files:**

You can write a DataFrame to a JSON file using the `to_json()` method:

```python

# Save DataFrame to a JSON file

df_json.to_json('output_file.json', orient='records', lines=True)  # 'records' orientation is common for JSON

```
**Parsing JSON Strings:**

If you have a JSON string, you can convert it to a DataFrame:

```python

import json

# Example JSON string

json_string = '''

[

{"name": "Alice", "age": 30, "city": "New York"},

{"name": "Bob", "age": 25, "city": "Los Angeles"}

]

'''

# Convert JSON string to DataFrame

data = json.loads(json_string)

df_json_from_string = pd.DataFrame(data)

# Display the DataFrame

print(df_json_from_string)

```
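
For nested JSON, pandas also provides `pd.json_normalize`, which flattens inner objects into dotted column names (a hedged sketch with made-up records):

```python
import pandas as pd

nested = [
    {"name": "Alice", "address": {"city": "New York", "zip": "10001"}},
    {"name": "Bob", "address": {"city": "Los Angeles", "zip": "90001"}},
]

# flattens to columns: name, address.city, address.zip
df_flat = pd.json_normalize(nested)
print(df_flat)
```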

### c. Reading and Writing Microsoft Excel Files (XLSX)

**Reading Excel Files:**

To read Excel files, use `pd.read_excel()`. You need the `openpyxl` or `xlrd` library to read `.xlsx` or
`.xls` files, respectively:

```python
# URL of an Excel file (example dataset)

excel_url = 'https://people.sc.fsu.edu/~jburkardt/data/xlsx/airtravel.xlsx'

# Read the Excel file into a DataFrame

df_excel = pd.read_excel(excel_url)

# Display the first few rows of the DataFrame

print(df_excel.head())

```

**Writing Excel Files:**

To write a DataFrame to an Excel file, use the `to_excel()` method:

```python

# Save DataFrame to an Excel file

df_excel.to_excel('output_file.xlsx', index=False) # index=False avoids writing row indices to the file

```

For more advanced usage, you can write multiple DataFrames to different sheets in the same Excel
file using `ExcelWriter`:

```python

with pd.ExcelWriter('output_file.xlsx') as writer:

    df_excel.to_excel(writer, sheet_name='Sheet1', index=False)

    df_json_from_string.to_excel(writer, sheet_name='Sheet2', index=False)

```

Data Cleaning and Transformation Preparation


a. Perform data cleaning by creating a DataFrame and identifying missing data using NA (Not Available) handling methods, filter out missing data using the dropna function, fill the missing data using the fillna function and remove duplicates using the duplicated and drop_duplicates functions.

Data cleaning and transformation are crucial steps in preparing your dataset for analysis. Here’s how
you can perform these tasks using `pandas` in Python:

### a. Data Cleaning Preparation

Start by creating a sample DataFrame and then go through the processes of identifying missing data, handling missing values, and removing duplicates.

#### **1. Creating a Sample DataFrame**

```python

import pandas as pd

import numpy as np

# Create a sample DataFrame

data = {

'A': [1, 2, np.nan, 4, 5, 6, np.nan],

'B': ['a', 'b', 'c', np.nan, 'e', 'f', 'g'],

'C': [np.nan, 2.5, 3.6, 4.7, np.nan, 5.9, 6.1]

}

df = pd.DataFrame(data)

print("Original DataFrame:")

print(df)

```
#### **2. Identifying Missing Data**

To identify missing data, use `isna()` or `isnull()` functions:

```python

# Identify missing data

missing_data = df.isna()

print("\nMissing Data:")

print(missing_data)

```

#### **3. Handling Missing Data**

**a. Dropping Missing Data**

You can drop rows or columns with missing values using `dropna()`:

```python

# Drop rows with any missing values

df_dropped_rows = df.dropna()

# Drop columns with any missing values

df_dropped_columns = df.dropna(axis=1)

print("\nDataFrame after dropping rows with missing values:")

print(df_dropped_rows)

print("\nDataFrame after dropping columns with missing values:")

print(df_dropped_columns)

```
**b. Filling Missing Data**

You can fill missing data using `fillna()`:

```python

# Fill missing values with a specific value

df_filled_value = df.fillna(value=0)

# Fill missing values with the mean of each column

df_filled_mean = df.fillna(df.mean(numeric_only=True))  # numeric_only skips the string column 'B'

print("\nDataFrame after filling missing values with 0:")

print(df_filled_value)

print("\nDataFrame after filling missing values with column means:")

print(df_filled_mean)

```

**c. Forward and Backward Fill**

You can also use forward fill (`ffill`) and backward fill (`bfill`) methods:

```python

# Forward fill missing values

df_filled_ffill = df.ffill()

# Backward fill missing values

df_filled_bfill = df.bfill()

print("\nDataFrame after forward fill:")


print(df_filled_ffill)

print("\nDataFrame after backward fill:")

print(df_filled_bfill)

```

#### **4. Removing Duplicates**

To find and remove duplicate rows, use `duplicated()` and `drop_duplicates()`:

```python

# Find duplicate rows

duplicates = df.duplicated()

print("\nDuplicate Rows:")

print(duplicates)

# Remove duplicate rows

df_no_duplicates = df.drop_duplicates()

print("\nDataFrame after removing duplicates:")

print(df_no_duplicates)

```
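
`drop_duplicates` also takes `subset` and `keep` arguments to control which columns define a duplicate and which copy survives (a short hedged sketch using the same df):

```python
# Treat rows as duplicates when column 'A' matches, keeping the last occurrence
df_dedup_a = df.drop_duplicates(subset=['A'], keep='last')
print(df_dedup_a)
```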

### Summary of Functions

- **Identifying Missing Data**: `isna()`, `isnull()`

- **Dropping Missing Data**: `dropna()`

- Drop rows with missing values: `df.dropna()`

- Drop columns with missing values: `df.dropna(axis=1)`

- **Filling Missing Data**: `fillna()`

- Fill with a specific value: `df.fillna(value)`


- Fill with column means or other statistics: `df.fillna(df.mean(numeric_only=True))`

- Forward fill: `df.ffill()`

- Backward fill: `df.bfill()`

- **Removing Duplicates**: `duplicated()`, `drop_duplicates()`

Data transformation in pandas involves modifying values, renaming columns or indices, and ensuring
that the original dataset remains unchanged when necessary. Here's how you can perform these
transformations:

### **1. Modifying Values Using `map` and `replace`**

#### **a. Using `map`**

The `map()` method is useful for element-wise transformations in a DataFrame or Series. For
example, you might want to transform categorical values or apply a function to each element.

```python

import pandas as pd

import numpy as np

# Create a sample DataFrame

data = {

'A': [1, 2, 3, 4, 5],

'B': ['apple', 'banana', 'cherry', 'date', 'elderberry']

}

df = pd.DataFrame(data)

# Define a mapping function

def transform_value(x):

    # np.integer is included because map passes NumPy scalars, which are not Python ints
    return x * 10 if isinstance(x, (int, np.integer)) else x.upper()

# Apply the map function to column 'A'


df['A_transformed'] = df['A'].map(transform_value)

# Apply the map function to column 'B'

df['B_transformed'] = df['B'].map(str.upper)

print("DataFrame after applying map function:")

print(df)

```

#### **b. Using `replace`**

The `replace()` method is useful for replacing specific values with new values.

```python

# Replace specific values in column 'B'

df_replaced = df.replace({'apple': 'APPLE', 'banana': 'BANANA'})

print("\nDataFrame after applying replace method:")

print(df_replaced)

```

### **2. Creating a Transformed Version Without Modifying the Original Dataset Using `rename`**

The `rename()` method allows you to change column or index names without modifying the original
dataset.

```python

# Create a transformed version of the DataFrame with renamed columns

df_renamed = df.rename(columns={'A': 'Column_A', 'B': 'Column_B'})

print("\nOriginal DataFrame:")
print(df)

print("\nTransformed DataFrame with renamed columns:")

print(df_renamed)

```

Data Preparation

a. Create a DataFrame with normally distributed data using random sampling and detect
possible outliers.

To create a DataFrame with normally distributed data and detect possible outliers, follow these
steps:

### **1. Create a DataFrame with Normally Distributed Data**

We will use `numpy` to generate normally distributed data and `pandas` to create the DataFrame.

```python

import pandas as pd

import numpy as np

# Set random seed for reproducibility

np.random.seed(42)

# Define parameters for normally distributed data


num_samples = 100

mean = 0

std_dev = 1

# Generate normally distributed data

data = {

'Feature1': np.random.normal(mean, std_dev, num_samples),

'Feature2': np.random.normal(mean, std_dev, num_samples)

}

# Create DataFrame

df = pd.DataFrame(data)

print("DataFrame with Normally Distributed Data:")

print(df.head())

```

### **2. Detect Possible Outliers**

There are several methods to detect outliers. We'll use the following methods:

- **Z-Score Method**: This method uses the standard deviation and mean to identify outliers.

- **Interquartile Range (IQR) Method**: This method identifies outliers based on the IQR, which is
the range between the first and third quartiles.

#### **a. Z-Score Method**

The Z-score measures how many standard deviations a data point is from the mean. Typically, data
points with Z-scores greater than 3 or less than -3 are considered outliers.

```python

from scipy import stats


# Compute Z-scores

z_scores = np.abs(stats.zscore(df))

# Identify outliers using a threshold (e.g., Z-score > 3)

outliers_z_score = (z_scores > 3)

# Display rows with outliers

outlier_indices = np.where(outliers_z_score.any(axis=1))[0]

print("\nOutliers detected using Z-score method:")

print(df.iloc[outlier_indices])

```

#### **b. Interquartile Range (IQR) Method**

The IQR method identifies outliers by calculating the range between the first quartile (Q1) and the
third quartile (Q3). Values outside 1.5 times the IQR from Q1 and Q3 are considered outliers.

```python

# Calculate Q1 (25th percentile) and Q3 (75th percentile) for each feature

Q1 = df.quantile(0.25)

Q3 = df.quantile(0.75)

# Calculate IQR

IQR = Q3 - Q1

# Determine outliers

outliers_iqr = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR)))

# Display rows with outliers

outlier_indices_iqr = df[outliers_iqr.any(axis=1)].index
print("\nOutliers detected using IQR method:")

print(df.loc[outlier_indices_iqr])

```
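
Either mask can then be used to filter the outliers out; a hedged one-liner reusing the IQR mask from above:

```python
# Keep only the rows where no feature is flagged as an outlier
df_clean = df[~outliers_iqr.any(axis=1)]
print(df_clean.shape)
```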

b. Perform text manipulation with regular expressions by applying relevant regular expression methods to split a string with a variable number of whitespace characters (tabs, spaces, and newlines) and get a list of all matching patterns.

Text manipulation with regular expressions (regex) in Python is powerful for extracting, splitting, and
finding patterns in strings. To handle splitting a string with variable whitespace characters and
finding all matching patterns, you can use the `re` module.

Here's a step-by-step guide:

### **1. Import the `re` Module**

First, import the `re` module which provides support for regular expressions.

```python

import re

```

### **2. Define a Sample String**

Create a string with variable whitespace characters including tabs, spaces, and newlines.

```python

text = """
This is a sample text

with various whitespace

characters.

Here\tis\tanother line

with \nmultiple \nnewlines

"""

```

### **3. Split the String Using Regular Expressions**

To split the string by any whitespace character (spaces, tabs, and newlines), use the `re.split()`
method with the regular expression `\s+` which matches one or more whitespace characters.

```python

# Split the string by any whitespace character

split_text = re.split(r'\s+', text)

print("Split text:")

print(split_text)

```

### **4. Find All Patterns Matching a Regular Expression**

If you want to find all occurrences of a specific pattern in the string, use the `re.findall()` method. For
example, to find all words (sequences of non-whitespace characters), you can use the regular
expression `\S+`.

```python

# Find all words in the text

words = re.findall(r'\S+', text)

print("\nList of all words:")


print(words)

```

### **Example Code**

```python

import re

# Define a sample string

text = """

This is a sample text

with various whitespace

characters.

Here\tis\tanother line

with \nmultiple \nnewlines

"""

# Split the string by any whitespace character

split_text = re.split(r'\s+', text)

print("Split text:")

print(split_text)

# Find all words (non-whitespace sequences)

words = re.findall(r'\S+', text)

print("\nList of all words:")

print(words)

```
Let's go through each of the data visualization tasks using Matplotlib and Seaborn. We'll use a
sample online dataset from `seaborn` for demonstration purposes.

### **a. Create a Line Plot**

To create a line plot with Matplotlib, including setting titles, axis labels, ticks, tick labels, and
annotations, follow these steps:

```python

import matplotlib.pyplot as plt

import seaborn as sns

# Load a sample dataset from seaborn

df = sns.load_dataset('flights')

# Pivot the dataset to get the data in the right format for line plot

pivot_df = df.pivot(index='month', columns='year', values='passengers')  # keyword arguments required in pandas 2.0+

# Create a line plot

plt.figure(figsize=(12, 6))

sns.lineplot(data=pivot_df, dashes=False)

# Set title and axis labels

plt.title('Number of Passengers Over Time')

plt.xlabel('Year')

plt.ylabel('Number of Passengers')

# Set ticks and tick labels

plt.xticks(rotation=45)

plt.yticks(range(0, 800, 100))

# Annotate a point (example: annotate the point for the year 1950, month January)
plt.annotate('1950 Jan', xy=(0, pivot_df.loc['Jan', 1950]), xytext=(0, 600),

             arrowprops=dict(facecolor='black', shrink=0.05))

# Save the plot to a file

plt.savefig('line_plot.png')

plt.show()

```

### **b. Create Bar Plots**

**i. Create Bar Plots Using a DataFrame**

```python

import pandas as pd

# Create a sample DataFrame

df = pd.DataFrame({

'A': [3, 4, 2],

'B': [7, 6, 8],

'C': [5, 3, 7]

}, index=['X', 'Y', 'Z'])

# Create bar plots

df.plot(kind='bar', figsize=(10, 6))

# Set title and labels

plt.title('Bar Plot for DataFrame Rows')

plt.xlabel('Index')

plt.ylabel('Values')

plt.savefig('bar_plot.png')

plt.show()
```

**ii. Create Stacked Bar Plots**

```python

# Create stacked bar plots

df.plot(kind='bar', stacked=True, figsize=(10, 6))

# Set title and labels

plt.title('Stacked Bar Plot for DataFrame Rows')

plt.xlabel('Index')

plt.ylabel('Values')

plt.savefig('stacked_bar_plot.png')

plt.show()

```

### **c. Create Histogram and Density Plot**

```python

# Load a sample dataset

df = sns.load_dataset('iris')

# Create a histogram

plt.figure(figsize=(12, 6))

sns.histplot(df['sepal_length'], kde=False, bins=20)

# Set title and labels

plt.title('Histogram of Sepal Length')

plt.xlabel('Sepal Length')

plt.ylabel('Frequency')
plt.savefig('histogram.png')

plt.show()

# Create a density plot

plt.figure(figsize=(12, 6))

sns.kdeplot(df['sepal_length'], fill=True)  # fill= replaces the deprecated shade= argument

# Set title and labels

plt.title('Density Plot of Sepal Length')

plt.xlabel('Sepal Length')

plt.ylabel('Density')

plt.savefig('density_plot.png')

plt.show()

```

### **d. Create a Scatter Plot**

```python

# Load a sample dataset

df = sns.load_dataset('penguins')

# Create a scatter plot

plt.figure(figsize=(12, 6))

sns.scatterplot(x='flipper_length_mm', y='body_mass_g', data=df)

# Set title and labels

plt.title('Scatter Plot of Flipper Length vs Body Mass')

plt.xlabel('Flipper Length (mm)')

plt.ylabel('Body Mass (g)')


plt.savefig('scatter_plot.png')

plt.show()

```

### **e. Create Box Plots**

```python

# Load a sample dataset

df = sns.load_dataset('tips')

# Create a box plot

plt.figure(figsize=(12, 6))

sns.boxplot(x='day', y='total_bill', data=df)

# Set title and labels

plt.title('Box Plot of Total Bill by Day')

plt.xlabel('Day')

plt.ylabel('Total Bill')

plt.savefig('box_plot.png')

plt.show()

```

- **Line Plot**: Use `sns.lineplot()` and `plt.annotate()` to annotate and customize.

- **Bar Plots**: Use `df.plot(kind='bar')` for grouped bars and `df.plot(kind='bar', stacked=True)` for
stacked bars.

- **Histogram and Density Plot**: Use `sns.histplot()` for histograms and `sns.kdeplot()` for density
plots.

- **Scatter Plot**: Use `sns.scatterplot()` to visualize the relationship between two variables.
- **Box Plot**: Use `sns.boxplot()` to visualize data distribution and outliers.

#9.

Time Series Analysis

a. Create time series using datetime object in pandas indexed by timestamps.

b. Use pandas.date_range to generate a DatetimeIndex with an indicated length.

c. Generate data ranges by setting time zone, localize time zone and convert to particular time
zone using tz_convert and combine two different time zones.

d. Perform period arithmetic such as adding and subtracting integers from periods and
construct range of periods using period_range function.

e. Convert Periods and PeriodIndex objects to another frequency with asfreq method.

f. Convert Series and DataFrame objects indexed by timestamps to periods with the to_period
method.

g. Perform resampling, downsampling and upsampling for the time series.

Here's how you can perform various time series analyses using pandas:

### **a. Create Time Series Using Datetime Object Indexed by Timestamps**

To create a time series with a datetime index, you can use `pd.date_range` and then create a
DataFrame or Series.

```python
import pandas as pd

import numpy as np

# Create a time series with datetime index

date_range = pd.date_range(start='2024-01-01', periods=10, freq='D')

data = np.random.randn(len(date_range))

# Create a DataFrame with datetime index

df = pd.DataFrame(data, index=date_range, columns=['Value'])

print("Time Series DataFrame:")

print(df)

```

### **b. Generate DatetimeIndex with `pd.date_range`**

You can use `pd.date_range` to generate a `DatetimeIndex` with a specified length and frequency.

```python

# Generate a DatetimeIndex with an indicated length

datetime_index = pd.date_range(start='2024-01-01', periods=12, freq='M') # Monthly frequency

print("Generated DatetimeIndex:")

print(datetime_index)

```

### **c. Generate Data Ranges with Time Zones**

You can set, localize, and convert time zones using pandas.

```python
# Create a timezone-naive datetime index

naive_dates = pd.date_range(start='2024-01-01', periods=3, freq='D')

# Localize to a specific time zone (e.g., 'UTC')

localized_dates = naive_dates.tz_localize('UTC')

# Convert to another time zone (e.g., 'America/New_York')

converted_dates = localized_dates.tz_convert('America/New_York')

print("Localized Dates (UTC):")

print(localized_dates)

print("\nConverted Dates (America/New_York):")

print(converted_dates)

# Combine two different time zones into a DataFrame

df_timezone = pd.DataFrame({

'UTC': localized_dates,

'New_York': converted_dates

})

print("\nDataFrame with Different Time Zones:")

print(df_timezone)

```

### **d. Perform Period Arithmetic and Construct Range of Periods**

Period arithmetic can be performed by adding or subtracting periods, and you can use
`pd.period_range` to create ranges of periods.

```python
# Create a period range

period_range = pd.period_range(start='2024-01', periods=6, freq='M')

print("Period Range:")

print(period_range)

# Add and subtract periods

period = pd.Period('2024-01', freq='M')

new_period = period + 2 # Adding 2 months

previous_period = period - 1 # Subtracting 1 month

print("\nPeriod Arithmetic:")

print(f"Original Period: {period}")

print(f"Period after adding 2 months: {new_period}")

print(f"Period after subtracting 1 month: {previous_period}")

```

### **e. Convert Periods and PeriodIndex Objects to Another Frequency**

You can convert periods and `PeriodIndex` objects to another frequency using the `asfreq` method.

```python

# Create a PeriodIndex

period_index = pd.period_range(start='2024-01', periods=6, freq='M')

# Convert to a different frequency (e.g., quarterly) using asfreq

quarterly_index = period_index.asfreq('Q')

print("\nQuarterly PeriodIndex:")

print(quarterly_index)

```
### **f. Convert Series and DataFrame Indexed by Timestamps to Periods**

Convert a DataFrame or Series with a timestamp index to a period index.

```python

# Create a DataFrame with a timestamp index

timestamp_df = pd.DataFrame(data, index=pd.date_range(start='2024-01-01', periods=10, freq='D'))

# Convert to periods (e.g., monthly periods)

period_df = timestamp_df.to_period('M')

print("\nDataFrame with Timestamp Index:")

print(timestamp_df)

print("\nDataFrame with Period Index (Monthly):")

print(period_df)

```

### **g. Perform Resampling, Downsampling, and Upsampling**

Resampling, downsampling, and upsampling can be done using the `resample` method.

```python

# Create a time series with daily frequency

date_range = pd.date_range(start='2024-01-01', periods=30, freq='D')

data = np.random.randn(len(date_range))

ts = pd.Series(data, index=date_range)

# Resample to monthly frequency (downsampling)

monthly_ts = ts.resample('M').mean()
# Upsample to hourly frequency

hourly_ts = ts.resample('H').asfreq()

print("\nOriginal Time Series:")

print(ts.head())

print("\nMonthly Resampled Time Series:")

print(monthly_ts.head())

print("\nHourly Upsampled Time Series:")

print(hourly_ts.head())

```
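
Upsampling introduces NaN slots at the new timestamps; a hedged follow-up showing how `interpolate` can fill them:

```python
# Fill the NaN slots created by upsampling with linear interpolation
hourly_filled = ts.resample('H').interpolate()

print(hourly_filled.head())
```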

- **Time Series Creation**: Use `pd.date_range` to create a time series with a datetime index.

- **DatetimeIndex Generation**: Generate `DatetimeIndex` with `pd.date_range`.

- **Time Zones**: Localize, convert, and combine time zones using `tz_localize` and `tz_convert`.

- **Period Arithmetic**: Add or subtract periods and create period ranges with `pd.period_range`.

- **Frequency Conversion**: Convert periods and `PeriodIndex` objects to different frequencies


using `asfreq`.

- **Index Conversion**: Convert timestamp-indexed DataFrames/Series to periods with `to_period`.

- **Resampling**: Use `resample` for downsampling and upsampling time series data.

Let's dive into data aggregation with pandas. We’ll work with creating a DataFrame, grouping by
single and multiple keys, computing summary statistics, and performing exploratory data analysis.

### **a. Create a Tabular Dataset and Split Data into Groups**
**Creating a DataFrame:**

```python

import pandas as pd

import numpy as np

# Create a sample DataFrame

data = {

'Category': ['A', 'B', 'A', 'B', 'A', 'B', 'C', 'C', 'A', 'B'],

'Subcategory': ['X', 'Y', 'X', 'Y', 'Z', 'X', 'Y', 'Z', 'Z', 'X'],

'Value': np.random.randint(1, 100, 10)

}

df = pd.DataFrame(data)

print("Original DataFrame:")

print(df)

```

**Group by a Single Key:**

```python

# Group by a single column (e.g., 'Category')

grouped_single = df.groupby('Category')

print("\nGrouped by 'Category':")

for name, group in grouped_single:

    print(f"\nGroup '{name}':")

    print(group)

```
**Group by Multiple Keys:**

```python

# Group by multiple columns (e.g., 'Category' and 'Subcategory')

grouped_multiple = df.groupby(['Category', 'Subcategory'])

print("\nGrouped by 'Category' and 'Subcategory':")

for name, group in grouped_multiple:

    print(f"\nGroup {name}:")

    print(group)

```

### **b. Compute Summary Statistics for Grouped Data**

**Compute Aggregates:**

```python

# Compute summary statistics (sum, mean, std) for the grouped data

aggregated_data = grouped_multiple['Value'].agg(['sum', 'mean', 'std'])

print("\nAggregated Data (Sum, Mean, Std):")

print(aggregated_data)

```

### **c. Use `groupby` for Advanced Data Aggregation**

We'll use an online dataset for this example. Let's use the `seaborn` library's built-in dataset "tips".

```python

import seaborn as sns


# Load the 'tips' dataset

df_tips = sns.load_dataset('tips')

# Display the first few rows

print("\nSample 'tips' dataset:")

print(df_tips.head())

```

**Group by One Column:**

```python

# Group by 'day' and compute summary statistics

grouped_day = df_tips.groupby('day').agg({

'total_bill': ['sum', 'mean', 'std'],

'tip': ['sum', 'mean', 'std'],

'size': 'mean' # Compute the mean size

})

print("\nAggregated Data by 'day':")

print(grouped_day)

```

**Group by Multiple Columns:**

```python

# Group by 'day' and 'time', and compute summary statistics

grouped_day_time = df_tips.groupby(['day', 'time']).agg({

'total_bill': ['sum', 'mean', 'std'],

'tip': ['sum', 'mean', 'std'],

'size': 'mean'

})
print("\nAggregated Data by 'day' and 'time':")

print(grouped_day_time)

```

**Perform Exploratory Data Analysis (EDA):**

Let's compute additional summary statistics and visualizations.

```python

# Compute basic statistics

basic_stats = df_tips.describe(include='all')

print("\nBasic Statistics for 'tips' dataset:")

print(basic_stats)

# Visualize total bill by day

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))

sns.barplot(x='day', y='total_bill', data=df_tips)

plt.title('Total Bill by Day')

plt.show()

```

- **Creating DataFrame**: Use pandas DataFrame to organize data.

- **Grouping**:

- **Single Key**: Use `groupby('column_name')`.

- **Multiple Keys**: Use `groupby(['col1', 'col2'])`.


- **Aggregation**: Compute summary statistics using `agg()` with functions like `sum`, `mean`, `std`.

- **Exploratory Data Analysis**:

- **Single and Multiple Columns**: Perform grouping and aggregation.

- **Visualizations**: Use libraries like Matplotlib and Seaborn for plotting.
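
As a closing sketch (hedged; the column names follow the tips dataset used above), pandas also supports named aggregation, which yields flat, readable column names instead of a MultiIndex:

```python
# Named aggregation: each output column gets an explicit name
named_agg = df_tips.groupby('day').agg(
    total_revenue=('total_bill', 'sum'),
    avg_tip=('tip', 'mean'),
    avg_party_size=('size', 'mean'),
)
print(named_agg)
```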
