practise

to generate various visualizations using pandas and matplotlib.

 1import pandas as pd
 2import matplotlib.pyplot as plt
 3
 4# Sample Data
 5data = {
 6    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
 7    'Sales': [200, 250, 300, 280, 350],
 8    'Profit': [20, 30, 40, 35, 50]
 9}
10
11df = pd.DataFrame(data)
12
13# 1. Line Plot
14df.plot(x='Month', y='Sales', kind='line', marker='o')
15plt.title('Monthly Sales Trend')
16plt.xlabel('Month')
17plt.ylabel('Sales')
18plt.show()
19
20# 2. Vertical Bar Chart
21df.plot(x='Month', y='Sales', kind='bar')
22plt.title('Monthly Sales')
23plt.xlabel('Month')
24plt.ylabel('Sales')
25plt.show()
26
27# 3. Horizontal Bar Chart
28df.plot(x='Month', y='Sales', kind='barh')
29plt.title('Monthly Sales (Horizontal Bar Chart)')
30plt.xlabel('Sales')
31plt.ylabel('Month')
32plt.show()
33
34# 4. Histogram
35df['Sales'].plot(kind='hist', bins=5)
36plt.title('Sales Distribution')
37plt.xlabel('Sales')
38plt.ylabel('Frequency')
39plt.show()
40
41# 5. Pie Chart
42df.set_index('Month')['Sales'].plot(kind='pie', autopct='%1.1f%%')
43plt.title('Sales Contribution by Month')
44plt.ylabel('')
45plt.show()
46
47# 6. Scatter Plot
48df.plot(x='Sales', y='Profit', kind='scatter')
49plt.title('Sales vs Profit')
50plt.xlabel('Sales')
51plt.ylabel('Profit')
52plt.show()
53
54# 7. Box Plot
55df[['Sales', 'Profit']].plot(kind='box')
56plt.title('Box Plot of Sales and Profit')
57plt.ylabel('Values')
58plt.show()

to carry out simple multivariate analysis with a focus on PCA and LDA.

 1import pandas as pd
 2import matplotlib.pyplot as plt
 3import seaborn as sns
 4from sklearn.datasets import load_iris
 5from sklearn.decomposition import PCA
 6from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 7
 8# Load data
 9iris = load_iris()
10X, y = iris.data, iris.target
11df = pd.DataFrame(X, columns=iris.feature_names)
12
13# Profile Plot
14df.head().T.plot()
15plt.title("Profile Plot")
16plt.show()
17
18# Correlation Heatmap
19sns.heatmap(df.corr(), annot=True)
20plt.title("Correlation Heatmap")
21plt.show()
22
23# PCA Scree Plot
24pca = PCA()
25pca.fit(X)
26plt.plot(range(1,5), pca.explained_variance_ratio_, marker='o')
27plt.title("PCA Scree Plot")
28plt.show()
29
30# PCA Scatter Plot
31X_pca = PCA(n_components=2).fit_transform(X)
32plt.scatter(X_pca[:,0], X_pca[:,1], c=y)
33plt.title("PCA Scatter Plot")
34plt.show()
35
36# LDA Scatter Plot
37X_lda = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)
38plt.scatter(X_lda[:,0], X_lda[:,1], c=y)
39plt.title("LDA Scatter Plot")
40plt.show()

financial data analysis using heat map, clustering, histogram.

 1import yfinance as yf
 2import pandas as pd
 3import matplotlib.pyplot as plt
 4import seaborn as sns
 5from scipy import stats
 6from sklearn.cluster import KMeans
 7
 8# --------------------------------------------------
 9# 1. Load Financial Data
10# --------------------------------------------------
11data = yf.download("AAPL", start="2020-01-01", end="2021-01-01")
12print(data.head())
13
14# --------------------------------------------------
15# 2. Data Cleaning
16# --------------------------------------------------
17data = data.dropna()
18
19# --------------------------------------------------
20# 3. Outlier Detection (Z-Score)
21# --------------------------------------------------
22z = stats.zscore(data['Close'])
23data = data[abs(z) < 3]
24
25# --------------------------------------------------
26# 4. Line Chart
27# --------------------------------------------------
28plt.plot(data.index, data['Close'])
29plt.title("Apple Closing Price")
30plt.show()
31
32# --------------------------------------------------
33# 5. Histogram
34# --------------------------------------------------
35plt.hist(data['Close'], bins=20)
36plt.title("Histogram of Closing Price")
37plt.show()
38
39# --------------------------------------------------
40# 6. Correlation Heatmap
41# --------------------------------------------------
42sns.heatmap(data.corr(), annot=True)
43plt.title("Correlation Heatmap")
44plt.show()
45
46# --------------------------------------------------
47# 7. Descriptive Statistics
48# --------------------------------------------------
49print(data.describe())
50
51# --------------------------------------------------
52# 8. Hypothesis Testing (T-Test)
53# --------------------------------------------------
54before = data[data.index < "2020-09-01"]['Close']
55after = data[data.index >= "2020-09-01"]['Close']
56
57t, p = stats.ttest_ind(before, after)
58print("T-Test:", t)
59print("P-Value:", p)
60
61# --------------------------------------------------
62# 9. Clustering
63# --------------------------------------------------
64X = data[['Open', 'Close']]
65
66kmeans = KMeans(n_clusters=3, random_state=0)
67data['Cluster'] = kmeans.fit_predict(X)
68
69plt.scatter(data['Open'], data['Close'],
70            c=data['Cluster'])
71plt.title("K-Means Clustering")
72plt.xlabel("Open")
73plt.ylabel("Close")
74plt.show()

time series analysis - stock market

 1import yfinance as yf
 2import matplotlib.pyplot as plt
 3import seaborn as sns
 4from scipy import stats
 5from sklearn.cluster import KMeans
 6from sklearn.linear_model import LinearRegression
 7from statsmodels.tsa.arima.model import ARIMA
 8
 9# ----------------------------------
10# 1. Load Financial Data
11# ----------------------------------
12data = yf.download("AAPL", start="2020-01-01", end="2021-01-01")
13data = data.dropna()
14
15# ----------------------------------
16# 2. Outlier Detection (Z-Score)
17# ----------------------------------
18z = stats.zscore(data['Close'])
19data = data[abs(z) < 3]
20
21# ----------------------------------
22# 3. Line Chart
23# ----------------------------------
24plt.plot(data.index, data['Close'])
25plt.title("Apple Stock Prices")
26plt.show()
27
28# ----------------------------------
29# 4. Histogram
30# ----------------------------------
31plt.hist(data['Close'], bins=20)
32plt.title("Histogram")
33plt.show()
34
35# ----------------------------------
36# 5. Correlation Heatmap
37# ----------------------------------
38sns.heatmap(data.corr(), annot=True)
39plt.title("Correlation Heatmap")
40plt.show()
41
42# ----------------------------------
43# 6. Descriptive Statistics
44# ----------------------------------
45print(data.describe())
46
47# ----------------------------------
48# 7. Hypothesis Testing (T-Test)
49# ----------------------------------
50before = data[data.index < "2020-09-01"]['Close']
51after = data[data.index >= "2020-09-01"]['Close']
52
53t, p = stats.ttest_ind(before, after)
54print("T-Statistic:", t)
55print("P-Value:", p)
56
57# ----------------------------------
58# 8. K-Means Clustering
59# ----------------------------------
60X = data[['Open', 'Close']]
61
62kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
63data['Cluster'] = kmeans.fit_predict(X)
64
65plt.scatter(data['Open'], data['Close'],
66            c=data['Cluster'])
67plt.xlabel("Open")
68plt.ylabel("Close")
69plt.title("K-Means Clustering")
70plt.show()
71
72# ----------------------------------
73# 9. Linear Regression
74# ----------------------------------
75X = data[['Volume']]
76y = data['Close']
77
78model = LinearRegression()
79model.fit(X, y)
80
81print("Intercept:", model.intercept_)
82print("Coefficient:", model.coef_[0])
83
84# ----------------------------------
85# 10. Time Series Forecasting (ARIMA)
86# ----------------------------------
87ts = data[['Close']].copy()
88ts = ts.asfreq('B')
89ts['Close'] = ts['Close'].ffill()
90
91arima = ARIMA(ts['Close'], order=(1,0,0))
92result = arima.fit()
93
94forecast = result.forecast(5)
95print("Forecast:")
96print(forecast)

visualization of streaming dataset.

weather forecasting

 1import requests
 2import matplotlib.pyplot as plt
 3import time
 4
 5API_KEY = "YOUR_API_KEY"
 6CITY = "Hyderabad"
 7
 8temps = []
 9
10for i in range(5):      # collect 5 readings
11
12    url = f"https://api.openweathermap.org/data/2.5/weather?q={CITY}&appid={API_KEY}&units=metric"
13
14    data = requests.get(url).json()
15
16    if "main" in data:
17        temps.append(data["main"]["temp"])
18
19    time.sleep(2)
20
21plt.plot(temps, marker='o')
22plt.title("Streaming Weather Data")
23plt.xlabel("Reading")
24plt.ylabel("Temperature")
25plt.show()

stock market code

 1import yfinance as yf
 2import matplotlib.pyplot as plt
 3import time
 4
 5prices = []
 6
 7for i in range(5):      # collect 5 readings
 8
 9    stock = yf.Ticker("AAPL")
10    price = stock.history(period="1d")["Close"].iloc[-1]
11
12    prices.append(price)
13
14    time.sleep(2)
15
16plt.plot(prices, marker='o')
17plt.title("Streaming Stock Prices")
18plt.xlabel("Reading")
19plt.ylabel("Price")
20plt.show()

market based data analysis and visualization

 1import yfinance as yf
 2import matplotlib.pyplot as plt
 3
 4# ----------------------------------
 5# 1. Load Market Data
 6# ----------------------------------
 7data = yf.Ticker("TCS.NS").history(period="1y")
 8
 9# ----------------------------------
10# 2. Basic Statistics
11# ----------------------------------
12print(data.describe())
13print(data.isnull().sum())
14
15# ----------------------------------
16# 3. Daily Returns
17# ----------------------------------
18data['Daily Return'] = data['Close'].pct_change()
19
20# ----------------------------------
21# 4. Closing Price Trend
22# ----------------------------------
23plt.plot(data['Close'])
24plt.title("Closing Price Trend")
25plt.show()
26
27# ----------------------------------
28# 5. Moving Average
29# ----------------------------------
30data['MA50'] = data['Close'].rolling(50).mean()
31
32plt.plot(data['Close'], label='Close')
33plt.plot(data['MA50'], label='MA50')
34plt.legend()
35plt.title("Moving Average Analysis")
36plt.show()
37
38# ----------------------------------
39# 6. Daily Returns Visualization
40# ----------------------------------
41plt.plot(data['Daily Return'])
42plt.title("Daily Returns")
43plt.show()
44
45# ----------------------------------
46# 7. Histogram of Returns
47# ----------------------------------
48plt.hist(data['Daily Return'].dropna())
49plt.title("Return Distribution")
50plt.show()

text visualization using web analytics.

 1import requests
 2from bs4 import BeautifulSoup
 3from collections import Counter
 4import pandas as pd
 5import matplotlib.pyplot as plt
 6from wordcloud import WordCloud
 7from nltk.corpus import stopwords
 8import nltk
 9
10nltk.download('stopwords')
11
12# Extract text
13url = "https://example.com"
14html = requests.get(url).text
15
16soup = BeautifulSoup(html, "html.parser")
17text = " ".join([p.get_text() for p in soup.find_all('p')])
18
19# Preprocessing
20words = text.lower().split()
21
22stop_words = set(stopwords.words('english'))
23words = [w for w in words if w.isalnum() and w not in stop_words]
24
25# Frequency Analysis
26freq = Counter(words)
27
28df = pd.DataFrame(freq.items(),
29                  columns=['Word','Frequency'])
30
31df = df.sort_values(by='Frequency',
32                    ascending=False).head(10)
33
34# Bar Chart
35plt.bar(df['Word'], df['Frequency'])
36plt.title("Top 10 Words")
37plt.xticks(rotation=45)
38plt.show()
39
40# Word Cloud
41wc = WordCloud().generate(text)
42plt.imshow(wc)
43plt.axis("off")
44plt.show()
45
46# Pie Chart
47plt.pie(df['Frequency'],
48        labels=df['Word'],
49        autopct='%1.1f%%')
50plt.show()

visualization of various massive datasets - finance, healthcare, census, geospatial.

 1import pandas as pd
 2import matplotlib.pyplot as plt
 3
 4# Finance Data
 5plt.plot([2020,2021,2022,2023],
 6         [100,150,180,250])
 7plt.title("Finance Data")
 8plt.show()
 9
10# Healthcare Data
11plt.bar(['Diabetes','Cancer','Asthma'],
12        [200,150,180])
13plt.title("Healthcare Data")
14plt.show()
15
16# Census Data
17plt.pie([500,700,600],
18        labels=['0-18','19-35','36-60'],
19        autopct='%1.1f%%')
20plt.title("Census Data")
21plt.show()
22
23# Geospatial Data
24plt.scatter([72.87,77.20,77.59],
25            [19.07,28.61,12.97])
26plt.title("Geospatial Data")
27plt.show()