Working with Directories and File Paths:
The os and os.path modules provide functions for working with directories and file paths.
Sample program to list files in a directory and get file paths:
import os

# Placeholder location; replace with a real directory before running.
directory = "/path/to/directory"

# List all entries in the directory. os.listdir raises FileNotFoundError for
# a missing path, so guard the call to keep the sample runnable while the
# placeholder is still in place.
if os.path.isdir(directory):
    files = os.listdir(directory)
else:
    files = []
    print(f"Directory not found: {directory}")
for entry in files:
    print(entry)

# Get the absolute path of a file
file_path = os.path.abspath("sample.txt")
print(file_path)

# Check if a path exists and if it's a directory or file
if os.path.exists(file_path):
    if os.path.isfile(file_path):
        print("It's a file.")
    elif os.path.isdir(file_path):
        print("It's a directory.")
These advanced file handling functions and techniques can help you work with different file formats, manipulate directories, and efficiently handle various file-related tasks in Python.
# Arithmetic operations on array objects (NumPy function-call form)
# Two 3x3 integer matrices used as operands
matrix1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
matrix2 = np.array([[9, 8, 7], [6, 5, 4], [3, 2, 1]])

# Element-wise sum and difference
addition_result = np.add(matrix1, matrix2)
subtraction_result = np.subtract(matrix1, matrix2)
# For two 2-D arrays np.dot is the matrix product, not an element-wise product
multiplication_result = matrix1.dot(matrix2)

# Print every operand and result under its heading
report = (
    ("Matrix 1:", matrix1),
    ("\nMatrix 2:", matrix2),
    ("\nMatrix Addition:", addition_result),
    ("\nMatrix Subtraction:", subtraction_result),
    ("\nMatrix Multiplication:", multiplication_result),
)
for heading, values in report:
    print(heading)
    print(values)
# Two 2x2 operands for the operator-based (element-wise) arithmetic demo
matrix1 = np.array([[1, 2], [3, 4]])
matrix2 = np.array([[5, 6], [7, 8]])

# Every operator below works element by element
addition_result = matrix1 + matrix2
subtraction_result = matrix1 - matrix2
elementwise_multiplication_result = matrix1 * matrix2
# True division: the result is a float array
elementwise_division_result = matrix1 / matrix2

# Show the operands, then each result, one heading per value
print("Matrix1:", matrix1, sep="\n")
print("\nMatrix2:", matrix2, sep="\n")
print("\nMatrix Addition:", addition_result, sep="\n")
print("\nMatrix Subtraction:", subtraction_result, sep="\n")
print("\nElement-wise Matrix Multiplication:", elementwise_multiplication_result, sep="\n")
print("\nElement-wise Matrix Division:", elementwise_division_result, sep="\n")
# Nine consecutive integers (10..18), also viewed as a 3x3 matrix
arr2 = np.arange(10, 19)
reshaped_array = np.reshape(arr2, (3, 3))
print("Martrix\n", reshaped_array)

# Integer-array indexing on the FLAT array: the result takes the shape of the
# index array (4x1) and holds arr2[0], arr2[2], arr2[6] and arr2[8]
corner_positions = np.array([[0], [2], [6], [8]])
arrindex_value = arr2[corner_positions]
print("Fetched Matrix Value using indexing\n", arrindex_value)
# 27 consecutive integers (0..26) arranged as three 3x3 matrices
arr2 = np.arange(27)
reshaped_array = arr2.reshape((3, 3, 3))
print("3 Matices\n", reshaped_array)

# Drill down one axis at a time: matrix 1 -> row 1 -> column 1
middle_matrix = reshaped_array[1]
middle_row = middle_matrix[1]
print("\nFetched single value", middle_row[1])
print("Access row of single value ", middle_row)
print("Matrix acess of single value\n", middle_matrix)
# Descriptive Analysis
import numpy as np
import scipy.stats as stats

# Given students' weights
weight = [65, 70, 83, 88, 90, 90, 71, 85, 79, 95]

# Mean
mean = np.mean(weight)
print(f"Mean: {mean}")

# Median
median = np.median(weight)
print(f"Median: {median}")

# Mode: NumPy has no mode(); scipy.stats.mode provides it.
mode_var = stats.mode(weight)
# Depending on the SciPy version the result fields are scalars or length-1
# arrays; np.atleast_1d lets the same indexing work for both.
mode_value = np.atleast_1d(mode_var.mode)[0]
mode_count = np.atleast_1d(mode_var.count)[0]
print(f"Mode: {mode_value} (appears {mode_count} times)")

# Standard Deviation (ddof=1 divides by N - 1; rounded to a whole number)
std_dev = round(np.std(weight, ddof=1))
print(f"Standard Deviation: {std_dev}")

# Variance (ddof=1 divides by N - 1; rounded to a whole number)
variance = round(np.var(weight, ddof=1))
print(f"Variance: {variance}")

# Range
data_range = np.max(weight) - np.min(weight)
print(f"Range: {data_range}")

# Interquartile Range (IQR)
q1 = np.percentile(weight, 25)
print(f"First Quartile (Q1): {q1}")
q3 = np.percentile(weight, 75)
print(f"Third Quartile (Q3): {q3}")
iqr = q3 - q1
print(f"Interquartile Range (IQR): {iqr}")

# Percentiles
percentiles = np.percentile(weight, [25, 50, 75])
print(f"25th Percentile: {percentiles[0]}")
print(f"50th Percentile: {percentiles[1]}")
print(f"75th Percentile: {percentiles[2]}")
In NumPy functions like np.std() and np.var(), the ddof parameter stands for "Delta Degrees of Freedom".
Calculation: When calculating the variance or standard deviation, the denominator used is N - ddof, where N is the number of elements in the array.
Default Value: The default value for ddof in NumPy is 0.
Unbiased Estimator: Setting ddof=1 (Bessel's correction) gives an unbiased estimator of the population variance, assuming the sample is drawn from a larger population. The corresponding standard deviation is less biased than with ddof=0, but it is not exactly unbiased.
Why is ddof=1 important?
When calculating the variance or standard deviation of a sample, using ddof=0 (the default) can lead to a biased estimate of the population variance. This is because the sample variance tends to underestimate the true population variance.
import numpy as np

# Create a sample dataset
data = np.array([2, 3, 5, 7, 10, 12, 15])

# Calculate the mean (average) of the dataset
mean = np.mean(data)

# Calculate the standard deviation of the dataset (population form, ddof=0)
std_dev = np.std(data)

# Calculate the median of the dataset. It is kept in its own variable so it
# does not overwrite the standard deviation used for the z-scores below.
median = np.median(data)

# Calculate the Z-scores for each data point: distance from the mean in
# units of standard deviation
z_scores = (data - mean) / std_dev

# Display the Z-scores
print("Data:")
print(data)
print("\nZ-Scores:")
print(z_scores)
import pandas as pd

# Two Series that share the default 0..4 index
series1 = pd.Series([1, 2, 3, 4, 5])
series2 = pd.Series([10, 20, 30, 40, 50])

# Method spellings of +, -, * and /; values are matched by index label
result_add = series1.add(series2)
result_sub = series2.sub(series1)
result_mul = series1.mul(series2)
result_div = series2.div(series1)

# Print each result under its heading
for heading, outcome in (
    ("Addition:", result_add),
    ("\nSubtraction:", result_sub),
    ("\nMultiplication:", result_mul),
    ("\nDivision:", result_div),
):
    print(heading)
    print(outcome)
import pandas as pd

# Operands: two equally long Series on the default integer index
series1 = pd.Series([1, 2, 3, 4, 5])
series2 = pd.Series([10, 20, 30, 40, 50])

# Operator arithmetic between Series is element-wise, matched by index label
result_add = series1 + series2
result_sub = series2 - series1
result_mul = series1 * series2
result_div = series2 / series1

# One print call per result: heading on the first line, values after it
print("Addition:", result_add, sep="\n")
print("\nSubtraction:", result_sub, sep="\n")
print("\nMultiplication:", result_mul, sep="\n")
print("\nDivision:", result_div, sep="\n")
import pandas as pd
import numpy as np

# Creating a Series
series = pd.Series([1, 4, 9, 16, 25])

# Calculate the square root (NumPy ufuncs apply element-wise to a Series)
result_sqrt = np.sqrt(series)


# Apply a custom function
def custom_function(x):
    """Return *x* doubled."""
    return x * 2


# Series.apply calls custom_function on the values of the Series
result_custom = series.apply(custom_function)

print("Square Root:")
print(result_sqrt)
print("\nCustom Function:")
print(result_custom)
import pandas as pd

# Marks to filter
series = pd.Series([45, 80, 30, 40, 50, 25, 65, 90, 85, 92])

# Boolean mask: True where the value is strictly greater than 50
above_fifty = series > 50
# Indexing with the mask keeps only the True rows and their original labels
filtered_series = series[above_fifty]

print("\nConditional Filtering:", filtered_series, sep="\n")
import pandas as pd

# Three named Series on the same default index
series1 = pd.Series([178, 180, 165, 156, 189], name='Height')
series2 = pd.Series([80, 90, 70, 60, 85], name='Weight')
series3 = pd.Series([10.1, 20.2, 30.3, 40.4, 50.5], name='BMI')

# Show each Series on its own
for column in (series1, series2, series3):
    print(column)

# Place the Series side by side; each Series name becomes its column label
df = pd.concat([series1, series2, series3], axis=1)

# Show the combined table
print(df)
import pandas as pd

# Height in centimetres and weight in kilograms for five people
height = pd.Series([160, 175, 180, 170, 165], name='Height (cm)')
weight = pd.Series([60, 75, 80, 70, 68], name='Weight (kg)')

# BMI = weight (kg) / height (m) squared, so convert cm to m first
height_meters = height / 100
bmi = weight / (height_meters ** 2)

# Round to whole numbers and label the result
bmi_series = pd.Series(bmi.round(), name='BMI')

# Assemble the three columns, keyed by each Series' own name
df = pd.DataFrame({
    height.name: height,
    weight.name: weight,
    bmi_series.name: bmi_series,
})

# Show the table
print(df)
import pandas as pd

# Create a Series from the Height values defined here (the original passed
# an unrelated/undefined name `data` instead)
Height = [10, 20, 30, 40, 50]
series = pd.Series(Height, name='Height')

# Calculate mean, median, and standard deviation
mean_value = series.mean()
median_value = series.median()
# Series.std() uses ddof=1 (sample standard deviation) by default
std_deviation = round(series.std())

# Create a DataFrame to store the results
result_df = pd.DataFrame({
    'Metric': ['Mean', 'Median', 'Standard Deviation'],
    'Value': [mean_value, median_value, std_deviation]
})

# Display the DataFrame
print(result_df)
import pandas as pd

# Column-oriented input: each key is a column label, each list its values
data = dict(
    Name=['Bhaskar', 'Gopinath', 'Senthil', 'Venkat'],
    Desg=['DL', 'TL', 'DL', 'GM'],
    City=['Chennai', 'Bangalore', 'Chennai', 'Delhi'],
)
df = pd.DataFrame(data)

# Show the table
print(df)
import pandas as pd

# Row-oriented input: every inner list is one record (name, age, occupation)
data_list = [
    ['Alice', 25, 'Engineer'],
    ['Bob', 30, 'Designer'],
    ['Charlie', 22, 'Data Analyst'],
    ['David', 35, 'Manager'],
]

# Column labels, in the same order as the values inside each record
column_names = ['Name', 'Age', 'Occupation']
df = pd.DataFrame(data=data_list, columns=column_names)

# Show the table
print(df)
import pandas as pd

# Row-oriented input: every tuple is one record (name, age, occupation)
data_tuples = [
    ('Alice', 25, 'Engineer'),
    ('Bob', 30, 'Designer'),
    ('Charlie', 22, 'Data Analyst'),
    ('David', 35, 'Manager'),
]

# Build the table from the records, labelling the three fields
df = pd.DataFrame.from_records(data_tuples, columns=['Name', 'Age', 'Occupation'])

# Show the table
print(df)
import pandas as pd
import numpy as np

# Column-oriented input: each array holds the values of one column
array_list = [
    np.array([1, 2, 3, 4]),
    np.array(['Alice', 'Bob', 'Charlie', 'David']),
    np.array([25, 30, 22, 35]),
]

# Pair each label with its array, in order, and build the table
column_labels = ('ID', 'Name', 'Age')
df = pd.DataFrame(dict(zip(column_labels, array_list)))

# Show the table
print(df)
import pandas as pd

# Three named Series on the same default index
series1 = pd.Series([178, 180, 165, 156, 189], name='Height')
series2 = pd.Series([80, 90, 70, 60, 85], name='Weight')
series3 = pd.Series([10.1, 20.2, 30.3, 40.4, 50.5], name='BMI')

# Key each column by the Series' own name
df = pd.DataFrame({column.name: column for column in (series1, series2, series3)})
print(df)

# Summary statistics for each numeric column
summary = df.describe()
print(summary)
import pandas as pd

# Sample table with three columns
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
}
df = pd.DataFrame(data)
print(df)  # all columns

# drop() returns a new frame, so rebind df to keep the result
df = df.drop(columns='City')
print(df)  # 'City' column removed

df = df.drop(index=1)
print(df)  # row labelled 1 (Bob) removed
import pandas as pd

# Sample table with three columns
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 22],
    City=['New York', 'San Francisco', 'Los Angeles'],
)
df = pd.DataFrame(data)
print(df)

# `del` removes the column from df in place; nothing is returned
del df['City']
print(df)
Difference between del and drop:
drop - is a DataFrame method; by default it returns a new DataFrame
del - is a Python statement; it modifies the DataFrame in place
drop - operates on both columns and rows
del - operates only on columns
drop - can remove multiple items at a time
del - removes one item at a time
import pandas as pd

# Creating a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, 30, 35, 40, 45]}
df = pd.DataFrame(data)

# Row counts are named once so the printed labels always match what is shown
# (the original label said "Last 3 records" while displaying two rows)
head_count = 3
tail_count = 2

# Display the first rows
first_rows = df.head(head_count)
print(f"First {head_count} records: \n", first_rows)

# Display the last rows
last_rows = df.tail(tail_count)
print(f"Last {tail_count} records: \n", last_rows)

# Get the shape of the DataFrame
print("Shape of my dataframe: ", df.shape)
# Output: (5, 2) (5 rows, 2 columns)
import pandas as pd

# Sample table with three columns
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'],
}
df = pd.DataFrame(data)

# A single label in square brackets selects one column as a Series
name_column = df['Name']
# A list of labels selects several columns as a DataFrame
wanted_columns = ['Name', 'Age']
name_age_columns = df[wanted_columns]
# Attribute (dot) access also selects one column as a Series
age_column = df.Age

# Print each selection under its heading
print("Name Column (Square Brackets):", name_column, sep="\n")
print("Name and age Column (Square Brackets):", name_age_columns, sep="\n")
print("\nAge Column (Dot Notation):", age_column, sep="\n")
# Row selection in a DataFrame
import pandas as pd

# Sample table with three columns
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'],
}
df = pd.DataFrame(data)

# loc[] with a boolean mask: keep rows whose City is Chicago OR whose Age is
# at least 30 (this is a filter, not a positional range)
in_chicago = df.City == "Chicago"
thirty_or_older = df.Age >= 30
sliced_rows = df.loc[in_chicago | thirty_or_older]

# iloc[] slices by position: positions 1 and 2 (the stop position 3 is excluded)
sliced_rows_by_index = df.iloc[1:3]

# Print both selections
print("Sliced Rows (loc[]):", sliced_rows, sep="\n")
print("\nSliced Rows (iloc[]):", sliced_rows_by_index, sep="\n")
import pandas as pd

# Sample table; note that two rows share Age 30
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [30, 25, 35, 40, 30],
    'Salary': [60000, 50000, 75000, 80000, 55000],
}
df = pd.DataFrame(data)

# Ascending order is the default for sort_values
sorted_df = df.sort_values('Age')
print(sorted_df)

# Same key, largest Age first
sorted_df = df.sort_values('Age', ascending=False)
print(sorted_df)
import pandas as pd

# Raw string so the backslashes in the Windows path are taken literally
csv_path = r'C:\Users\ICTAcademy\Desktop\FDP\PMIST FDP\Python Script\User Data.csv'
# Load the CSV file into a DataFrame and show it
df = pd.read_csv(csv_path)
print(df)
import pandas as pd

# Raw string so the backslashes in the Windows path are taken literally
workbook_path = r'C:\Users\ICTAcademy\Desktop\FDP\PMIST FDP\Python Script\User Data.xlsx'
# Load the worksheet named "Sheet1" into a DataFrame and show it
df = pd.read_excel(workbook_path, sheet_name="Sheet1")
print(df)
import pandas as p
import numpy as py

# The same file is read and then overwritten with the added columns
student_file = r'C:\Users\ICTAcademy\Desktop\FDP\PMIST FDP\Python Script\StudentData.csv'
df = p.read_csv(student_file)
print(df)

# Sum of the three subject columns, computed once and reused below
subject_total = df['Sub1'] + df['Sub2'] + df['Sub3']

df["Total"] = subject_total
print(df)

df["Average"] = subject_total / 3
print(df)

# Percentage of 300 (assumes each subject is marked out of 100 - TODO confirm),
# rounded to a whole number
df["percentage"] = ((subject_total / 300) * 100).round()
print(df)

# Write the table back without the index column
df.to_csv(student_file, index=False)
import pandas as p
import numpy as py

# Student file that is read and then overwritten
exist_file = r'C:\Users\ICTAcademy\Desktop\FDP\PMIST FDP\Python Script\StudentData.csv'
df = p.read_csv(exist_file)
print(df)

# New column: label and one mark per row (ten values, so the file is
# presumably expected to hold ten rows - verify against the data)
Sub4_name = 'Sub4'
Sub4_values = [60, 80, 50, 60, 45, 85, 95, 75, 95, 75]

# insert() modifies df in place, placing the column at position 4
df.insert(loc=4, column=Sub4_name, value=Sub4_values)
print(df)

# Write the table back without the index column
df.to_csv(exist_file, index=False)
# Checking missing values
import pandas as p
import numpy as py

exist_file = r'C:\Users\ICTAcademy\Desktop\FDP\PMIST FDP\Python Script\StudentData.csv'
df = p.read_csv(exist_file)

# 1. Check for missing values (True marks a missing cell)
print("\n1. Checking for Missing Values:")
print(df.isnull())

# 2. Count missing values in each column
print("\n2. Counting Missing Values in Each Column:")
print(df.isnull().sum())

# 3. Remove rows with missing values (returns a new frame; df is unchanged)
df_dropped = df.dropna()
print("\n3. DataFrame after Removing Rows with Missing Values:")
print(df_dropped)

# 4. Fill missing values with a specific value (e.g., mean of the column)
# numeric_only=True limits the mean to numeric columns; without it, pandas 2.x
# raises a TypeError when the frame also contains text columns.
mean_value = df.mean(numeric_only=True).round()
print("Mean Value is: ",mean_value)
# fillna with a Series fills each column from the entry with the same label
df_filled = df.fillna(mean_value)
print("\n4. DataFrame after Filling Missing Values with Mean:")
print(df_filled)

# 5. Replace missing values with a custom value per column
df_custom_filled = df.fillna({'Sub1': 60, 'Sub2': 65, 'Sub3': 70, 'Sub4':75})
print("\n5. DataFrame after Custom Filling of Missing Values:")
print(df_custom_filled)
import pandas as pd
# Package support for odbc connections
import pyodbc as po

# Connection string for SQL Server. The Server part is a raw string so the
# backslash in the instance name is a literal character rather than the
# start of an (invalid) "\S" escape sequence.
connection_string = (
    'Driver={SQL Server};'
    r'Server=LAPTOP-SMO6VN72\SQLEXPRESS;'
    'Database=AdventureWorksDW2020;'
)

# Establish a connection to SQL Server
connection = po.connect(connection_string)

# SQL query
sql_query = 'select ProductKey,EnglishProductName from dbo.DimProduct;'

# Execute the query and read data into a DataFrame; the connection is closed
# even when the query raises
try:
    df = pd.read_sql(sql_query, connection)
finally:
    connection.close()

print(df.head(10))
import pandas as pd

# A pandas Series of numbers; its default integer index is used for the x axis
data = pd.Series([60, 50, 65, 20, 45, 25, 65, 75, 25, 30, 40])

# Line plot (the default kind) on a figure 20 inches wide and 10 inches high
figure_size = (20, 10)
data.plot(figsize=figure_size)
import pandas as pd

# Table of names and their credit points
df = pd.DataFrame({
    'names': ['Bhaskar', 'Venkat', 'Sanjith', 'Vash'],
    'Credit Points': [10000, 45000, 30000, 20000],
})

# Bar graph 10 inches wide and 5 inches high, with tick labels rotated 90 degrees
bar_options = dict(x='names', y='Credit Points', rot=90, figsize=(10, 5))
df.plot.bar(**bar_options)
import pandas as pd

# One value per labelled row; the index labels name the pie slices
slice_labels = ['A', 'B', 'C']
df = pd.DataFrame({'value': [3.330, 4.87, 5.97]}, index=slice_labels)

# Pie chart of the 'value' column on a 5 x 5 inch figure
df.plot.pie(y='value', figsize=(5, 5))
import pandas as pd
import matplotlib.pyplot as plt

# Sample yearly revenue figures (replace with your own DataFrame)
data = pd.DataFrame({
    'Year': [2010, 2011, 2012, 2013, 2014, 2015],
    'Revenue': [1050, 1210, 1310, 1210, 1600, 1400],
})

# Line plot with square markers joined by a solid line
data.plot(x='Year', y='Revenue', marker='s', linestyle='-')

# Title, axis labels and grid for the current axes
plt.title('Revenue Over Time')
plt.xlabel('Year')
plt.ylabel('Revenue (INR)')
plt.grid(True)

plt.show()
# Sample category counts (replace with your own DataFrame)
data = pd.DataFrame({
    'Category': ['A', 'B', 'C', 'D'],
    'Count': [30, 45, 60, 25],
})

# Bar plot of Count per Category
data.plot(kind='bar', x='Category', y='Count', color='skyblue')

# Title and axis labels for the current axes; category ticks rotated 90 degrees
plt.title('Category Counts')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=90)

plt.show()
import matplotlib.pyplot as plt
import numpy as np

# Eight x positions at 0.5, 1.5, ..., 7.5 and their heights
x = np.arange(8) + 0.5
y = [4.8, 5.5, 3.5, 4.6, 6.5, 6.6, 2.6, 3.0]

# Stem plot on a fresh figure
fig, ax = plt.subplots()
ax.stem(x, y)

# Both axes span 0..8 with ticks at the integers 1..7
axis_limits = (0, 8)
axis_ticks = np.arange(1, 8)
ax.set(xlim=axis_limits, xticks=axis_ticks,
       ylim=axis_limits, yticks=axis_ticks)

plt.show()
import matplotlib.pyplot as plt
import numpy as np

# x positions 0, 2, 4, 6, 8 and three series to stack on top of each other
x = np.arange(0, 10, 2)
ay = [1, 1.25, 2, 2.75, 3]
by = [1, 1, 1, 1, 1]
cy = [2, 1, 2, 1, 2]
# One row per series
y = np.vstack((ay, by, cy))

# Stacked area plot on a fresh figure
fig, ax = plt.subplots()
ax.stackplot(x, y)

# Both axes span 0..8 with ticks at the integers 1..7
axis_limits = (0, 8)
axis_ticks = np.arange(1, 8)
ax.set(xlim=axis_limits, xticks=axis_ticks,
       ylim=axis_limits, yticks=axis_ticks)

plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Student heights in centimetres (48 values)
heights = [160, 165, 170, 155, 175, 180, 162, 168, 172, 163, 166, 169, 176, 161, 164,
           158, 178, 173, 157, 171, 159, 167, 182, 174, 181, 177, 150, 183, 152, 179,
           185, 154, 187, 151, 184, 156, 153, 186, 189, 148, 190, 147, 188, 149, 146,
           191, 145, 144]

# Single-column table holding the heights
height_column = 'Heights (cm)'
df = pd.DataFrame({height_column: heights})

# Histogram with 10 bins and black bar outlines
plt.hist(df[height_column], bins=10, edgecolor='black')
plt.title('Histogram of Student Heights')
plt.xlabel('Height (cm)')
plt.ylabel('Frequency')

plt.show()
# Plot Box Plots
import pandas as p
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the columns in the BMI chart file (the title assumes they are
# Height, Weight and BMI - TODO confirm against the CSV)
df = p.read_csv('E:/Python Script/BMI_Chart.csv')
plt.figure(figsize=(8, 4))
sns.boxplot(data=df, palette='Set2')
plt.title('Box Plot of Columns Height, Weight, and BMI')

# The line plot below needs total_bill, tip and smoker columns, which come
# from seaborn's "tips" sample dataset. Load it here instead of relying on a
# `data` variable left over from an unrelated snippet.
data = sns.load_dataset("tips")

# Create a line plot using relplot
sns.relplot(x="total_bill", y="tip", kind="line", style='smoker', data=data)

# Set plot labels and title
plt.xlabel("Total Bill ($)")
plt.ylabel("Tip ($)")
plt.title("Line Plot of Total Bill vs. Tip")

# Show the plot
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# Scatter of tip against total bill with a fitted regression line
sns.lmplot(data=data, x="total_bill", y="tip")

# Axis labels and title for the current axes
plt.xlabel("Total Bill ($)")
plt.ylabel("Tip ($)")
plt.title("Scatter Plot with Regression Line")

plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# One point per bill, grouped by day
sns.stripplot(data=data, x="day", y="total_bill")

# Axis labels and title for the current axes
plt.xlabel("Day of the Week")
plt.ylabel("Total Bill ($)")
plt.title("Strip Plot of Total Bill by Day")

plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# Histogram of the total bill on an 8 x 4 inch figure (no KDE curve requested)
figure_size = (8, 4)
plt.figure(figsize=figure_size)
sns.histplot(x="total_bill", data=data, color="skyblue")

# Axis labels and title for the current axes
plt.xlabel("Total Bill ($)")
plt.ylabel("Frequency")
plt.title("Histogram without KDE")

plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# Filled kernel-density estimate of the total bill on an 8 x 4 inch figure
figure_size = (8, 4)
plt.figure(figsize=figure_size)
sns.kdeplot(x="total_bill", data=data, color="salmon", fill=True)

# Axis labels and title for the current axes
plt.xlabel("Total Bill ($)")
plt.ylabel("Density")
plt.title("KDE Plot")

plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# Rug plot: one tick per bill along the x axis, on a wide, short figure
figure_size = (8, 1)
plt.figure(figsize=figure_size)
sns.rugplot(x="total_bill", data=data, color="purple", height=0.5)

# Axis label and title for the current axes
plt.xlabel("Total Bill ($)")
plt.title("Rug Plot")

plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# Sample data
data = sns.load_dataset("tips")

# Create a swarmplot of total bill grouped by day
plt.figure(figsize=(8, 4))
sns.swarmplot(data=data, x="day", y="total_bill", palette="Set2")

# The x axis shows the "day" column, so label it accordingly (it was
# mislabelled "Gender")
plt.xlabel("Day of the Week")
plt.ylabel("Total Bill ($)")
plt.title("Swarm Plot of Total Bill by Day")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# Violin plot of the total bill per day on an 8 x 4 inch figure
figure_size = (8, 4)
plt.figure(figsize=figure_size)
sns.violinplot(x="day", y="total_bill", data=data, palette="Set2")

# Axis labels and title for the current axes
plt.xlabel("Day of the Week")
plt.ylabel("Total Bill ($)")
plt.title("Violin Plot of Total Bill by Day")

plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's "tips" sample dataset
data = sns.load_dataset("tips")

# Joint scatter plot of tip against total bill
sns.jointplot(x="total_bill", y="tip", data=data, kind="scatter")

# Figure-level title for the current figure
plt.suptitle("Jointplot of Total Bill vs. Tip")

plt.show()