To generate various visualizations using pandas and matplotlib.
# Chart gallery: builds a five-row monthly Sales/Profit table and shows seven
# chart types in turn. Every figure is displayed with plt.show() before the
# next one is drawn.
import pandas as pd
import matplotlib.pyplot as plt

# Sample data: one entry per month in each column.
data = {
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
    'Sales': [200, 250, 300, 280, 350],
    'Profit': [20, 30, 40, 35, 50],
}

df = pd.DataFrame(data)

# 1. Line plot of Sales per month, with a circle marker on each point.
ax = df.plot.line(x='Month', y='Sales', marker='o')
ax.set_title('Monthly Sales Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Sales')
plt.show()

# 2. Vertical bar chart of Sales per month.
ax = df.plot.bar(x='Month', y='Sales')
ax.set_title('Monthly Sales')
ax.set_xlabel('Month')
ax.set_ylabel('Sales')
plt.show()

# 3. Horizontal bar chart: the axis labels are swapped relative to chart 2.
ax = df.plot.barh(x='Month', y='Sales')
ax.set_title('Monthly Sales (Horizontal Bar Chart)')
ax.set_xlabel('Sales')
ax.set_ylabel('Month')
plt.show()

# 4. Histogram of the Sales values using five bins.
ax = df['Sales'].plot.hist(bins=5)
ax.set_title('Sales Distribution')
ax.set_xlabel('Sales')
ax.set_ylabel('Frequency')
plt.show()

# 5. Pie chart: indexing by Month makes the month names the wedge labels.
sales_by_month = df.set_index('Month')['Sales']
ax = sales_by_month.plot.pie(autopct='%1.1f%%')
ax.set_title('Sales Contribution by Month')
ax.set_ylabel('')  # blank out the y-axis label on the pie
plt.show()

# 6. Scatter plot of Profit against Sales.
ax = df.plot.scatter(x='Sales', y='Profit')
ax.set_title('Sales vs Profit')
ax.set_xlabel('Sales')
ax.set_ylabel('Profit')
plt.show()

# 7. Box plot of the two numeric columns side by side.
ax = df[['Sales', 'Profit']].plot.box()
ax.set_title('Box Plot of Sales and Profit')
ax.set_ylabel('Values')
plt.show()
To carry out simple multivariate analysis with a focus on PCA and LDA.
# Multivariate analysis of the Iris data set: profile plot, correlation
# heatmap, PCA scree plot, and 2-D PCA / LDA scatter plots coloured by class.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load data
iris = load_iris()
X, y = iris.data, iris.target
df = pd.DataFrame(X, columns=iris.feature_names)

# Profile Plot: transposing the first five rows puts the features on the
# x-axis and draws one line per sample.
df.head().T.plot()
plt.title("Profile Plot")
plt.show()

# Correlation Heatmap of the feature columns, with the values annotated.
sns.heatmap(df.corr(), annot=True)
plt.title("Correlation Heatmap")
plt.show()

# PCA Scree Plot
pca = PCA()
pca.fit(X)
explained = pca.explained_variance_ratio_
# Number the components from the fitted result rather than hard-coding 1..4,
# so the plot stays valid if the feature matrix changes width.
plt.plot(range(1, len(explained) + 1), explained, marker='o')
plt.title("PCA Scree Plot")
plt.show()

# PCA Scatter Plot: project onto the first two principal components.
X_pca = PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.title("PCA Scatter Plot")
plt.show()

# LDA Scatter Plot: supervised projection, so the class labels are passed too.
X_lda = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)
plt.scatter(X_lda[:, 0], X_lda[:, 1], c=y)
plt.title("LDA Scatter Plot")
plt.show()
Financial data analysis using heat map, clustering, and histogram.
# Financial data analysis of the AAPL price history downloaded for
# 2020-01-01 to 2021-01-01: cleaning, z-score outlier filtering, line chart,
# histogram, correlation heatmap, descriptive statistics, a two-sample t-test
# and K-Means clustering on Open/Close.
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans

# --------------------------------------------------
# 1. Load Financial Data
# --------------------------------------------------
data = yf.download("AAPL", start="2020-01-01", end="2021-01-01")
print(data.head())

# NOTE(review): newer yfinance releases appear to return two-level
# (price field, ticker) columns even for a single ticker, which would make
# data['Close'] a one-column DataFrame instead of a Series. Flattening to the
# first level is a no-op on single-level columns - confirm against the
# installed yfinance version.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# --------------------------------------------------
# 2. Data Cleaning
# --------------------------------------------------
data = data.dropna()

# Stop with a clear message instead of failing later inside the clustering
# step when the download returned nothing.
if data.empty:
    raise SystemExit("No price data available for AAPL; check network access.")

# --------------------------------------------------
# 3. Outlier Detection (Z-Score)
# --------------------------------------------------
z = stats.zscore(data['Close'])
# Keep rows whose closing price is within 3 standard deviations of the mean.
# .copy() makes the result an independent frame, so adding the 'Cluster'
# column in step 9 does not write into a slice of the original.
data = data[abs(z) < 3].copy()

# --------------------------------------------------
# 4. Line Chart
# --------------------------------------------------
plt.plot(data.index, data['Close'])
plt.title("Apple Closing Price")
plt.show()

# --------------------------------------------------
# 5. Histogram
# --------------------------------------------------
plt.hist(data['Close'], bins=20)
plt.title("Histogram of Closing Price")
plt.show()

# --------------------------------------------------
# 6. Correlation Heatmap
# --------------------------------------------------
sns.heatmap(data.corr(), annot=True)
plt.title("Correlation Heatmap")
plt.show()

# --------------------------------------------------
# 7. Descriptive Statistics
# --------------------------------------------------
print(data.describe())

# --------------------------------------------------
# 8. Hypothesis Testing (T-Test)
# --------------------------------------------------
# Compare closing prices before 2020-09-01 with those on or after it.
before = data[data.index < "2020-09-01"]['Close']
after = data[data.index >= "2020-09-01"]['Close']

t, p = stats.ttest_ind(before, after)
print("T-Test:", t)
print("P-Value:", p)

# --------------------------------------------------
# 9. Clustering
# --------------------------------------------------
X = data[['Open', 'Close']]

# n_init is set explicitly, matching the time-series script in this file.
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
data['Cluster'] = kmeans.fit_predict(X)

plt.scatter(data['Open'], data['Close'],
            c=data['Cluster'])
plt.title("K-Means Clustering")
plt.xlabel("Open")
plt.ylabel("Close")
plt.show()
time series analysis - stock market
# Time-series analysis of the AAPL price history downloaded for 2020-01-01 to
# 2021-01-01: outlier filtering, exploratory plots, a two-sample t-test,
# K-Means clustering, a linear regression of Close on Volume, and a 5-step
# ARIMA(1, 0, 0) forecast of the closing price.
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA

# ----------------------------------
# 1. Load Financial Data
# ----------------------------------
data = yf.download("AAPL", start="2020-01-01", end="2021-01-01")

# NOTE(review): newer yfinance releases appear to return two-level
# (price field, ticker) columns even for a single ticker, which would make
# data['Close'] a one-column DataFrame instead of a Series. Flattening to the
# first level is a no-op on single-level columns - confirm against the
# installed yfinance version.
if data.columns.nlevels > 1:
    data.columns = data.columns.get_level_values(0)

data = data.dropna()

# Stop with a clear message instead of failing later inside a model fit when
# the download returned nothing.
if data.empty:
    raise SystemExit("No price data available for AAPL; check network access.")

# ----------------------------------
# 2. Outlier Detection (Z-Score)
# ----------------------------------
z = stats.zscore(data['Close'])
# Keep rows whose closing price is within 3 standard deviations of the mean.
# .copy() makes the result an independent frame, so adding the 'Cluster'
# column in step 8 does not write into a slice of the original.
data = data[abs(z) < 3].copy()

# ----------------------------------
# 3. Line Chart
# ----------------------------------
plt.plot(data.index, data['Close'])
plt.title("Apple Stock Prices")
plt.show()

# ----------------------------------
# 4. Histogram
# ----------------------------------
plt.hist(data['Close'], bins=20)
plt.title("Histogram")
plt.show()

# ----------------------------------
# 5. Correlation Heatmap
# ----------------------------------
sns.heatmap(data.corr(), annot=True)
plt.title("Correlation Heatmap")
plt.show()

# ----------------------------------
# 6. Descriptive Statistics
# ----------------------------------
print(data.describe())

# ----------------------------------
# 7. Hypothesis Testing (T-Test)
# ----------------------------------
# Compare closing prices before 2020-09-01 with those on or after it.
before = data[data.index < "2020-09-01"]['Close']
after = data[data.index >= "2020-09-01"]['Close']

t, p = stats.ttest_ind(before, after)
print("T-Statistic:", t)
print("P-Value:", p)

# ----------------------------------
# 8. K-Means Clustering
# ----------------------------------
X = data[['Open', 'Close']]

kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
data['Cluster'] = kmeans.fit_predict(X)

plt.scatter(data['Open'], data['Close'],
            c=data['Cluster'])
plt.xlabel("Open")
plt.ylabel("Close")
plt.title("K-Means Clustering")
plt.show()

# ----------------------------------
# 9. Linear Regression
# ----------------------------------
# Single-feature regression: Volume is the predictor, Close the target.
X = data[['Volume']]
y = data['Close']

model = LinearRegression()
model.fit(X, y)

print("Intercept:", model.intercept_)
print("Coefficient:", model.coef_[0])

# ----------------------------------
# 10. Time Series Forecasting (ARIMA)
# ----------------------------------
ts = data[['Close']].copy()
# Re-index onto a business-day frequency; dates introduced by the re-index
# have no value, so they are forward-filled from the previous close.
ts = ts.asfreq('B')
ts['Close'] = ts['Close'].ffill()

arima = ARIMA(ts['Close'], order=(1, 0, 0))
result = arima.fit()

forecast = result.forecast(5)
print("Forecast:")
print(forecast)
visualization of streaming dataset.
weather forecasting
# Polls the OpenWeatherMap current-weather endpoint a fixed number of times
# and plots the temperatures that were successfully read.
import requests
import matplotlib.pyplot as plt
import time

API_KEY = "YOUR_API_KEY"
CITY = "Hyderabad"

BASE_URL = "https://api.openweathermap.org/data/2.5/weather"
READINGS = 5          # number of polls
INTERVAL_SECONDS = 2  # pause between consecutive polls
TIMEOUT_SECONDS = 10  # give up on a request that hangs

temps = []

for i in range(READINGS):  # collect 5 readings
    try:
        # Passing the query as params lets requests URL-encode the values.
        response = requests.get(
            BASE_URL,
            params={"q": CITY, "appid": API_KEY, "units": "metric"},
            timeout=TIMEOUT_SECONDS,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # A failed poll is reported and skipped rather than aborting the run.
        print(f"Reading {i + 1} failed: {exc}")
    else:
        if "main" in data:
            temps.append(data["main"]["temp"])
        else:
            # Show what the API returned (for example an invalid-key error)
            # instead of silently dropping the reading.
            print(f"Reading {i + 1} skipped, API response: {data}")

    # No need to wait after the final reading.
    if i < READINGS - 1:
        time.sleep(INTERVAL_SECONDS)

plt.plot(temps, marker='o')
plt.title("Streaming Weather Data")
plt.xlabel("Reading")
plt.ylabel("Temperature")
plt.show()
stock market code
# Polls the latest AAPL closing price five times, two seconds apart, and
# plots the collected values.
import yfinance as yf
import matplotlib.pyplot as plt
import time

prices = []

# The ticker handle does not change between polls, so build it once.
stock = yf.Ticker("AAPL")

for i in range(5):  # collect 5 readings

    history = stock.history(period="1d")

    if history.empty:
        # Nothing came back for this poll; skip it instead of letting
        # .iloc[-1] raise IndexError on an empty result.
        print(f"Reading {i + 1} skipped: no price data returned")
    else:
        price = history["Close"].iloc[-1]
        prices.append(price)

    time.sleep(2)

plt.plot(prices, marker='o')
plt.title("Streaming Stock Prices")
plt.xlabel("Reading")
plt.ylabel("Price")
plt.show()
market based data analysis and visualization
# Market data analysis for the "TCS.NS" ticker: summary statistics, daily
# returns, a 50-row moving average, and four charts shown one after another.
import yfinance as yf
import matplotlib.pyplot as plt

# ----------------------------------
# 1. Load Market Data
# ----------------------------------
ticker = yf.Ticker("TCS.NS")
data = ticker.history(period="1y")

# ----------------------------------
# 2. Basic Statistics
# ----------------------------------
print(data.describe())
print(data.isnull().sum())  # missing-value count per column

# ----------------------------------
# 3. Daily Returns
# ----------------------------------
# Change of each close relative to the previous row's close.
data['Daily Return'] = data['Close'].pct_change()

# ----------------------------------
# 4. Closing Price Trend
# ----------------------------------
_, trend_ax = plt.subplots()
trend_ax.plot(data['Close'])
trend_ax.set_title("Closing Price Trend")
plt.show()

# ----------------------------------
# 5. Moving Average
# ----------------------------------
# Rolling mean over a 50-row window; the first 49 rows have no value.
data['MA50'] = data['Close'].rolling(window=50).mean()

_, ma_ax = plt.subplots()
for column in ('Close', 'MA50'):
    ma_ax.plot(data[column], label=column)
ma_ax.legend()
ma_ax.set_title("Moving Average Analysis")
plt.show()

# ----------------------------------
# 6. Daily Returns Visualization
# ----------------------------------
_, returns_ax = plt.subplots()
returns_ax.plot(data['Daily Return'])
returns_ax.set_title("Daily Returns")
plt.show()

# ----------------------------------
# 7. Histogram of Returns
# ----------------------------------
# Drop the leading NaN produced by pct_change before binning.
_, hist_ax = plt.subplots()
hist_ax.hist(data['Daily Return'].dropna())
hist_ax.set_title("Return Distribution")
plt.show()
text visualization using web analytics.
# Text visualization of a web page: downloads the page, extracts paragraph
# text, counts non-stop-word frequencies and shows a bar chart, a word cloud
# and a pie chart of the ten most frequent words.
import string

import requests
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Extract text
url = "https://example.com"
response = requests.get(url, timeout=10)
# Fail on HTTP errors rather than analysing the text of an error page.
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, "html.parser")
text = " ".join([p.get_text() for p in soup.find_all('p')])

# Preprocessing
stop_words = set(stopwords.words('english'))
# Strip surrounding punctuation first; otherwise a token such as "domain."
# fails isalnum() and the word is lost from the counts entirely.
stripped = (w.strip(string.punctuation) for w in text.lower().split())
words = [w for w in stripped if w.isalnum() and w not in stop_words]

# Without any words there is nothing to chart, and the word cloud cannot be
# generated from empty text.
if not words:
    raise SystemExit(f"No words found in the paragraph text of {url}")

# Frequency Analysis
freq = Counter(words)

# most_common already returns the entries in descending frequency order.
df = pd.DataFrame(freq.most_common(10),
                  columns=['Word', 'Frequency'])

# Bar Chart
plt.bar(df['Word'], df['Frequency'])
plt.title("Top 10 Words")
plt.xticks(rotation=45)
plt.show()

# Word Cloud (built from the raw paragraph text, not the filtered word list)
wc = WordCloud().generate(text)
plt.imshow(wc)
plt.axis("off")
plt.show()

# Pie Chart
plt.pie(df['Frequency'],
        labels=df['Word'],
        autopct='%1.1f%%')
plt.show()
visualization of various massive datasets - finance, healthcare, census, geospatial.
# Four small illustrative charts, one per domain, each shown in its own
# figure: a line chart, a bar chart, a pie chart and a scatter plot.
import pandas as pd
import matplotlib.pyplot as plt

# Finance Data: one value per year, drawn as a line.
years = [2020, 2021, 2022, 2023]
amounts = [100, 150, 180, 250]
_, finance_ax = plt.subplots()
finance_ax.plot(years, amounts)
finance_ax.set_title("Finance Data")
plt.show()

# Healthcare Data: one bar per condition.
conditions = ['Diabetes', 'Cancer', 'Asthma']
counts = [200, 150, 180]
_, health_ax = plt.subplots()
health_ax.bar(conditions, counts)
health_ax.set_title("Healthcare Data")
plt.show()

# Census Data: share of each age band, with percentages on the wedges.
age_bands = ['0-18', '19-35', '36-60']
band_sizes = [500, 700, 600]
_, census_ax = plt.subplots()
census_ax.pie(band_sizes, labels=age_bands, autopct='%1.1f%%')
census_ax.set_title("Census Data")
plt.show()

# Geospatial Data: three points; the first list is plotted on the x-axis and
# the second on the y-axis.
x_coords = [72.87, 77.20, 77.59]
y_coords = [19.07, 28.61, 12.97]
_, geo_ax = plt.subplots()
geo_ax.scatter(x_coords, y_coords)
geo_ax.set_title("Geospatial Data")
plt.show()
27plt.show()