import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Load the flight records from the CSV file in the working directory.
df = pd.read_csv('Airlines.csv')
df

# Preview the first five, last five and five randomly chosen rows.
df.head()

df.tail()

df.sample(n=5)

# Column names, dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539383 entries, 0 to 539382
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           539383 non-null  int64 
 1   Airline      539383 non-null  object
 2   Flight       539383 non-null  int64 
 3   AirportFrom  539383 non-null  object
 4   AirportTo    539383 non-null  object
 5   DayOfWeek    539383 non-null  int64 
 6   Time         539383 non-null  int64 
 7   Length       539383 non-null  int64 
 8   Delay        539383 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 37.0+ MB

# Arithmetic mean of the 'Time' column.
mean_Time = df.loc[:, 'Time'].mean()
print(mean_Time)

802.7289625368245

# Arithmetic mean of the 'Length' column.
mean_length = df.loc[:, 'Length'].mean()
print(mean_length)

132.20200673732765

# Median (middle value) of the 'Time' column.
median_Time = df.loc[:, 'Time'].median()
print(median_Time)

795.0

# Median (middle value) of the 'Length' column.
median_length = df.loc[:, 'Length'].median()
print(median_length)

115.0

# Most frequent airline code; mode() returns every tied value in sorted order,
# so the first entry is taken.
Most_going_airline = df['Airline'].mode().iloc[0]
print(Most_going_airline)

WN

# Most frequent flight number (first entry of the sorted mode values).
Most_going_flight_number = df['Flight'].mode().iloc[0]
print(Most_going_flight_number)

16

# Largest value in the 'Time' column.
max_Time = df.loc[:, 'Time'].max()
print(max_Time)

1439

# Largest value in the 'Length' column.
max_Length = df.loc[:, 'Length'].max()
print(max_Length)

655

# Smallest value in the 'Time' column.
min_Time = df.loc[:, 'Time'].min()
print(min_Time)

10

# Smallest value in the 'Length' column.
# NOTE(review): the recorded output for this cell is 0 — check whether
# zero-length rows are valid records.
min_Length = df.loc[:, 'Length'].min()
print(min_Length)

0

# 25th percentile (first quartile) of 'Time': a quarter of the values lie at or
# below it. Despite the variable name, this is the lower quartile, not the top 25%.
Top_25_Time = df['Time'].quantile(q=0.25)
print(Top_25_Time)

565.0

# 25th percentile (first quartile) of 'Length': a quarter of the values lie at
# or below it. Despite the variable name, this is the lower quartile.
Top_25_Length = df['Length'].quantile(q=0.25)
print(Top_25_Length)

81.0

# 75th percentile (third quartile) of 'Time': three quarters of the values lie
# at or below it.
Top_75_Time = df['Time'].quantile(q=0.75)
print(Top_75_Time)

1035.0

# 75th percentile (third quartile) of 'Length': three quarters of the values
# lie at or below it.
Top_75_Length = df['Length'].quantile(q=0.75)
print(Top_75_Length)

162.0

# Standard deviation of 'Time'; the variance is reported as its square.
Standard_dev_Time = df.loc[:, 'Time'].std()
print(Standard_dev_Time)
print("Variance Time is ", Standard_dev_Time ** 2)

278.04591081679
Variance Time is  77309.52852193835

# Standard deviation of 'Length'; the variance is reported as its square.
Standard_dev_Length = df.loc[:, 'Length'].std()
print(Standard_dev_Length)
print("Variance Length is ", Standard_dev_Length ** 2)

70.11701559746602
Variance Length is  4916.395876295293

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Count the missing values in each column.
df.isnull().sum()

id             0
Airline        0
Flight         0
AirportFrom    0
AirportTo      0
DayOfWeek      0
Time           0
Length         0
Delay          0
dtype: int64

# Count rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

np.int64(0)

# 'id' should be unique: count the ids that repeat an earlier id.
df.duplicated(subset='id').sum()

np.int64(0)

# Weekday number -> weekday name (1 = Monday ... 7 = Sunday).
day = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    start=1,
))
df['Day of Week'] = df['DayOfWeek'].replace(day)

# The numeric column is superseded by the named one.
df = df.drop(columns=['DayOfWeek'])

def categorize_time(minutes):
    """Return the time-of-day label for a value given in minutes.

    Bands are half-open intervals: [360, 720) is 'Morning', [720, 1080) is
    'Afternoon' and [1080, 1440) is 'Evening'. Any value that falls in none
    of these bands (below 360, 1440 or above, or NaN) is labelled 'Night'.
    """
    bands = (
        (360, 720, 'Morning'),
        (720, 1080, 'Afternoon'),
        (1080, 1440, 'Evening'),
    )
    for lower, upper, label in bands:
        if lower <= minutes < upper:
            return label
    return 'Night'

# Label every flight with the time-of-day bucket derived from 'Time'.
df['Time_Category'] = df['Time'].map(categorize_time)

# Give the airport and length columns more descriptive names.
# NOTE(review): 'Length' is renamed to 'Distance' — confirm the column really
# holds a distance rather than a duration.
df = df.rename(columns={
    'AirportFrom': 'Origin Airport',
    'AirportTo': 'Destination Airport',
    'Length': 'Distance',
})

# "<origin>-<destination>" key for each flight, stored as a categorical.
df['Route'] = (
    df['Origin Airport'].astype(str)
    .str.cat(df['Destination Airport'].astype(str), sep='-')
    .astype('category')
)

# The row identifier is not used from here on.
df = df.drop(columns=['id'])

# Distinct airline codes, in order of first appearance.
pd.unique(df['Airline'])

array(['CO', 'US', 'AA', 'AS', 'DL', 'B6', 'HA', 'OO', '9E', 'OH', 'EV',
       'XE', 'YV', 'UA', 'MQ', 'FL', 'F9', 'WN'], dtype=object)

# Distinct origin airport codes, in order of first appearance.
pd.unique(df['Origin Airport'])

array(['SFO', 'PHX', 'LAX', 'ANC', 'LAS', 'SLC', 'DEN', 'ONT', 'FAI',
       'BQN', 'PSE', 'HNL', 'BIS', 'IYK', 'EWR', 'BOS', 'MKE', 'GFK',
       'OMA', 'GSO', 'LMT', 'SEA', 'MCO', 'TPA', 'DLH', 'MSP', 'FAR',
       'MFE', 'MSY', 'VPS', 'BWI', 'MAF', 'LWS', 'RST', 'ALB', 'DSM',
       'CHS', 'MSN', 'JAX', 'SAT', 'PNS', 'BHM', 'LIT', 'SAV', 'BNA',
       'ICT', 'ECP', 'DHN', 'MGM', 'CAE', 'PWM', 'ACV', 'EKO', 'PHL',
       'ATL', 'PDX', 'RIC', 'BTR', 'HRL', 'MYR', 'TUS', 'SBN', 'CAK',
       'TVC', 'CLE', 'ORD', 'DAY', 'MFR', 'BTV', 'TLH', 'TYS', 'DFW',
       'FLL', 'AUS', 'CHA', 'CMH', 'LRD', 'BRO', 'CRP', 'LAN', 'PVD',
       'FWA', 'JFK', 'LGA', 'OKC', 'PIT', 'PBI', 'ORF', 'DCA', 'AEX',
       'SYR', 'SHV', 'VLD', 'BDL', 'FAT', 'BZN', 'RDM', 'LFT', 'IPL',
       'EAU', 'ERI', 'BUF', 'IAH', 'MCI', 'AGS', 'ABI', 'GRR', 'LBB',
       'CLT', 'LEX', 'MBS', 'MOD', 'AMA', 'SGF', 'AZO', 'ABE', 'SWF',
       'BGM', 'AVP', 'FNT', 'GSP', 'ATW', 'ITH', 'TUL', 'COS', 'ELP',
       'ABQ', 'SMF', 'STL', 'IAD', 'DTW', 'RDU', 'RSW', 'OAK', 'ROC',
       'IND', 'CVG', 'MDW', 'SDF', 'ABY', 'TRI', 'XNA', 'ROA', 'MLI',
       'LYH', 'EVV', 'HPN', 'FAY', 'EWN', 'CSG', 'GPT', 'MLU', 'MOB',
       'OAJ', 'CHO', 'ILM', 'BMI', 'PHF', 'ACY', 'JAN', 'CID', 'GRK',
       'HOU', 'CRW', 'HTS', 'PSC', 'BOI', 'SBP', 'CLD', 'PSP', 'SBA',
       'MEM', 'MRY', 'GEG', 'RDD', 'PAH', 'CMX', 'SPI', 'EUG', 'CIC',
       'PIH', 'SGU', 'COD', 'MIA', 'MHT', 'GRB', 'FSD', 'SJU', 'AVL',
       'BFL', 'RAP', 'DRO', 'PIA', 'OGG', 'SIT', 'TXK', 'RNO', 'DAL',
       'SCE', 'MEI', 'MDT', 'FCA', 'SJC', 'KOA', 'PLN', 'SAN', 'GNV',
       'HLN', 'GJT', 'CPR', 'FSM', 'CMI', 'GTF', 'HDN', 'ITO', 'MTJ',
       'HSV', 'BTM', 'BIL', 'COU', 'MSO', 'SMX', 'TWF', 'ISP', 'GCC',
       'LIH', 'LNK', 'DAB', 'SNA', 'MQT', 'LGB', 'CWA', 'LSE', 'BUR',
       'ACT', 'MHK', 'MOT', 'IDA', 'SUN', 'GTR', 'MLB', 'SRQ', 'JAC',
       'ASE', 'LCH', 'JNU', 'ROW', 'BQK', 'YUM', 'FLG', 'EGE', 'GUC',
       'EYW', 'RKS', 'BGR', 'ELM', 'ADQ', 'OTZ', 'OTH', 'STT', 'KTN',
       'BET', 'SJT', 'CDC', 'CEC', 'SPS', 'SCC', 'STX', 'OME', 'MKG',
       'WRG', 'TYR', 'BRW', 'GGG', 'PSG', 'BKG', 'YAK', 'CLL', 'SAF',
       'CYS', 'LWB', 'CDV', 'FLO', 'BLI', 'DBQ', 'TOL', 'UTM', 'PIE',
       'ADK', 'ABR', 'TEX', 'MMH', 'GUM'], dtype=object)

# Distinct destination airport codes, in order of first appearance.
pd.unique(df['Destination Airport'])

array(['IAH', 'CLT', 'DFW', 'SEA', 'MSP', 'DTW', 'ORD', 'ATL', 'PDX',
       'JFK', 'SLC', 'HNL', 'PHX', 'MCO', 'OGG', 'LAX', 'KOA', 'ITO',
       'SFO', 'MIA', 'IAD', 'SMF', 'PHL', 'LIH', 'DEN', 'LGA', 'MEM',
       'CVG', 'YUM', 'CWA', 'MKE', 'BQN', 'FAI', 'LAS', 'ANC', 'BOS',
       'LGB', 'FLL', 'SJU', 'EWR', 'DCA', 'BWI', 'RDU', 'MCI', 'TYS',
       'SAN', 'ONT', 'OAK', 'MDW', 'BNA', 'DAL', 'CLE', 'JAX', 'JNU',
       'RNO', 'ELP', 'SAT', 'OTZ', 'MBS', 'BDL', 'STL', 'HOU', 'AUS',
       'SNA', 'SJC', 'LIT', 'TUS', 'TUL', 'CMH', 'LAN', 'IND', 'AMA',
       'CRP', 'PIT', 'RKS', 'FWA', 'TPA', 'PBI', 'JAN', 'DSM', 'ADQ',
       'GRB', 'PVD', 'ABQ', 'SDF', 'RSW', 'MSY', 'BUR', 'BOI', 'TLH',
       'BHM', 'ACV', 'ORF', 'BET', 'KTN', 'RIC', 'SRQ', 'BTR', 'XNA',
       'MHT', 'GRR', 'SBN', 'SBA', 'ROA', 'CID', 'GPT', 'MFR', 'SGU',
       'HPN', 'OMA', 'OTH', 'GSP', 'LMT', 'BUF', 'MSN', 'BFL', 'CAE',
       'HRL', 'OKC', 'SYR', 'COS', 'BTV', 'CDC', 'SCC', 'DAY', 'SJT',
       'TVC', 'ROC', 'ISP', 'MRY', 'SBP', 'MLI', 'MOB', 'CIC', 'SAV',
       'FAT', 'EKO', 'GEG', 'ECP', 'LFT', 'SUN', 'HSV', 'SHV', 'CHA',
       'CAK', 'BZN', 'MAF', 'GSO', 'MDT', 'PHF', 'ICT', 'AZO', 'RAP',
       'CHS', 'CLD', 'MKG', 'VPS', 'PIH', 'ATW', 'AGS', 'PNS', 'BIL',
       'SPI', 'FAR', 'CPR', 'PIA', 'SPS', 'TWF', 'LBB', 'ALB', 'CEC',
       'DRO', 'GJT', 'GNV', 'RST', 'AVL', 'GRK', 'PSP', 'LEX', 'TRI',
       'SGF', 'FSM', 'RDD', 'OME', 'MFE', 'LSE', 'BMI', 'MYR', 'FAY',
       'FSD', 'EUG', 'MGM', 'EVV', 'MLB', 'FNT', 'STT', 'WRG', 'ABE',
       'BIS', 'MOT', 'MLU', 'GFK', 'RDM', 'COU', 'LRD', 'PSC', 'MOD',
       'PWM', 'ILM', 'ABY', 'CRW', 'TXK', 'BRO', 'BRW', 'EYW', 'DAB',
       'ROW', 'ABI', 'EAU', 'TYR', 'MSO', 'FLG', 'CSG', 'VLD', 'DHN',
       'OAJ', 'AEX', 'CHO', 'SAF', 'GGG', 'FCA', 'ASE', 'BKG', 'MHK',
       'LNK', 'MQT', 'YAK', 'GTR', 'SMX', 'SWF', 'ITH', 'AVP', 'ELM',
       'BGM', 'SIT', 'PSG', 'CYS', 'CLL', 'SCE', 'LWB', 'LCH', 'GCC',
       'IYK', 'LWS', 'COD', 'HLN', 'BQK', 'GTF', 'DLH', 'BTM', 'EGE',
       'IDA', 'JAC', 'HDN', 'MTJ', 'CMX', 'CMI', 'CDV', 'LYH', 'ACT',
       'STX', 'IPL', 'PAH', 'HTS', 'MEI', 'BLI', 'ERI', 'EWN', 'FLO',
       'ACY', 'DBQ', 'TOL', 'GUC', 'PLN', 'BGR', 'PSE', 'PIE', 'UTM',
       'ADK', 'ABR', 'TEX', 'MMH', 'GUM'], dtype=object)

# 4. Statistical Analysis: Mode and Frequency
# Flights per time-of-day bucket (most frequent first) and the modal bucket.
category_counts = df['Time_Category'].value_counts()
time_mode = df['Time_Category'].mode().iloc[0]

print(f"Most Frequent Flying Period (Mode): {time_mode}")
print("\nFlight Volume per Category:", category_counts, sep="\n")

Most Frequent Flying Period (Mode): Morning

Flight Volume per Category:
Time_Category
Morning      218449
Afternoon    206930
Evening      107828
Night          6176
Name: count, dtype: int64

# Mean of the 'Delay' column within each time-of-day bucket, highest first.
delay_stats = (
    df.groupby('Time_Category')['Delay']
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage Delay Probability per Time Block:", delay_stats, sep="\n")

Average Delay Probability per Time Block:
Time_Category
Evening      0.514356
Afternoon    0.500913
Morning      0.363577
Night        0.279307
Name: Delay, dtype: float64

# Pairwise correlations of the numeric columns, annotated to two decimals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Flights per airline, split by the 'Delay' value; airlines are ordered from
# most to fewest flights.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df,
    x='Airline',
    hue='Delay',
    order=df['Airline'].value_counts().index,
    palette='plasma',
    legend=True,
    ax=ax,
)
ax.set_title('Number of Flights by Airline')
plt.xticks(rotation=45)
plt.show()

# Flights per weekday, with bars ordered from the busiest to the quietest day.
day_of_week_counts = df['Day of Week'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=df,
    x='Day of Week',
    hue='Day of Week',
    order=day_of_week_counts.index,
    palette='crest',
    legend=False,
    ax=ax,
)
ax.set_title('Number of Flights by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Number of Flights')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Share of flights per airline as a pie chart; each wedge shows its percentage.
df['Airline'].value_counts().plot(kind='pie', autopct='%.2f')
plt.title('Number of each Airline')
# Render the figure explicitly, like every other plotting cell; without this
# the cell only echoes the Text object returned by plt.title().
plt.show()

Text(0.5, 1.0, 'Number of each Airline')

# Mean of 'Delay' (the delay rate) for each time-of-day bucket.
time_category_order = ['Morning', 'Afternoon', 'Evening', 'Night']

plt.figure(figsize=(10, 6))
# seaborn deprecated `palette` without `hue`; assigning the x variable to `hue`
# (with the same order and no legend) gives the same colours without the warning.
sns.barplot(x='Time_Category', y='Delay', data=df,
            hue='Time_Category', order=time_category_order,
            hue_order=time_category_order, palette='magma', legend=False)
plt.title('Average Delay Probability by Time of Day', fontsize=14)
plt.xlabel('Time Category')
plt.ylabel('Delay Rate')
plt.show()

C:\Users\amitm\AppData\Local\Temp\ipykernel_27316\1706572809.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Time_Category', y='Delay', data=df,

# Histogram of the 'Distance' column.
# displot() is figure-level: it always creates its own figure, so a preceding
# plt.figure() only leaves an empty figure behind. The 10x6 size is requested
# through height/aspect instead.
sns.displot(df['Distance'], height=6, aspect=10 / 6)
plt.title('Distribution of Flight Distances')
plt.xlabel('Distance')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>

# Horizontal box plot of the 'Time' column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=df['Time'], color='mediumaquamarine', ax=ax)
ax.set_title('Box Plot of Flight Times')
ax.set_xlabel('Time')
plt.tight_layout()
plt.show()

# Flight counts per airline (descending) and the five largest of them.
flights = df['Airline'].value_counts()
top_5_airlines = flights.head(5)

fig, ax = plt.subplots(figsize=(6, 6))
ax.barh(y=top_5_airlines.index, width=top_5_airlines.values, color=sns.color_palette('Set2', 5))
ax.set_title('Top 5 Airlines by Number of Flights (using plt.barh)')
ax.set_xlabel('Number of Flights')
ax.set_ylabel('Airline')
# Put the airline with the most flights at the top of the chart.
ax.invert_yaxis()
plt.show()

# Mean of 'Delay' per airline, bars ordered from the highest to the lowest mean.
# 'Delay' only takes the values 0 and 1 in this dataset (see df.describe()), so
# the bar height is a delay rate, not a number of minutes; the labels say so.
airline_delay_order = (
    df.groupby('Airline')['Delay'].mean().sort_values(ascending=False).index
)

plt.figure(figsize=(10, 6))
sns.barplot(x='Airline', y='Delay', data=df, hue='Airline',
            order=airline_delay_order, palette='viridis', legend=True)
plt.title('Average Delay Rate per Airline')
plt.xlabel('Airline')
plt.ylabel('Average Delay Rate')
# Park the legend to the right of the axes so it does not cover the bars.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Data Visualization Updates
# Side-by-side view of flight volume and delay rate per time-of-day bucket.
time_category_order = ['Morning', 'Afternoon', 'Evening', 'Night']
plt.figure(figsize=(12, 5))

# Subplot 1: Volume of Flights
# `hue` mirrors `x` because seaborn deprecated `palette` without `hue`;
# hue_order keeps the colours in the same order as the bars.
plt.subplot(1, 2, 1)
sns.countplot(x='Time_Category', data=df, hue='Time_Category',
              order=time_category_order, hue_order=time_category_order,
              palette='viridis', legend=False)
plt.title('Flight Volume per Category')

# Subplot 2: Probability of Delay
plt.subplot(1, 2, 2)
sns.barplot(x='Time_Category', y='Delay', data=df, hue='Time_Category',
            order=time_category_order, hue_order=time_category_order,
            palette='magma', legend=False)
plt.title('Delay Probability per Category')

plt.tight_layout()
plt.show()

C:\Users\amitm\AppData\Local\Temp\ipykernel_27316\657825957.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Time_Category', data=df, order=['Morning', 'Afternoon', 'Evening', 'Night'], palette='viridis')
C:\Users\amitm\AppData\Local\Temp\ipykernel_27316\657825957.py:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Time_Category', y='Delay', data=df, order=['Morning', 'Afternoon', 'Evening', 'Night'], palette='magma')

# Ten origin airports with the most flights, as horizontal bars.
sns.set_style("whitegrid")
top_origins = df['Origin Airport'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_origins.values, y=top_origins.index, hue=top_origins.index,
            palette='rocket', legend=False, ax=ax)
ax.set_title('Top 10 Origin Airports', fontsize=15, fontweight='bold')
ax.set_xlabel('Number of Flights', fontsize=12)
ax.set_ylabel('Origin Airport', fontsize=12)
plt.show()

# Ten destination airports with the most flights, as horizontal bars.
sns.set_style("whitegrid")
top_destinations = df['Destination Airport'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_destinations.values, y=top_destinations.index,
            hue=top_destinations.index, palette='mako', legend=False, ax=ax)
ax.set_title('Top 10 Destination Airports', fontsize=15, fontweight='bold')
ax.set_xlabel('Number of Flights', fontsize=12)
ax.set_ylabel('Destination Airport', fontsize=12)
plt.show()

# 30-bin histogram of 'Distance' with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Distance'], bins=30, kde=True, color='teal', ax=ax)
ax.set_title('Distribution of Flight Distances')
ax.set_xlabel('Distance')
ax.set_ylabel('Frequency')
plt.show()

# Per-airline quartiles of 'Distance'. They are computed before plotting so the
# boxes can be drawn in exactly the order the annotation loop walks through.
stats = df.groupby('Airline')['Distance'].quantile([0.25, 0.5, 0.75]).unstack()
stats.columns = ['Q1', 'Median', 'Q3']
stats['IQR'] = stats['Q3'] - stats['Q1']
airline_order = list(stats.index)

plt.figure(figsize=(12, 6))

# Use palette='Set2' or 'Pastel1' so the background is light enough for the text to show.
# `order` is required: without it seaborn draws the boxes in order of first
# appearance in the data, while `stats` is sorted by airline code, so the
# annotations below would be written over the wrong boxes.
sns.boxplot(x='Airline', y='Distance', data=df, order=airline_order,
            palette='Set2', hue='Airline', legend=False)

plt.title('Distance Distribution by Airline (with Statistical Annotations)', fontsize=14, fontweight='bold')
plt.xlabel('Airline', fontsize=12)
plt.ylabel('Distance', fontsize=12)
plt.xticks(rotation=45)

# Box i on the x axis belongs to airline_order[i].
for i, airline in enumerate(airline_order):
    q1 = stats.loc[airline, 'Q1']
    q3 = stats.loc[airline, 'Q3']
    iqr = stats.loc[airline, 'IQR']
    median = stats.loc[airline, 'Median']

    # Text annotations: Q1 slightly left, Q3 slightly right, IQR just above the
    # box (10% of the box height above Q3), median centred on the median line.
    plt.text(i - 0.2, q1, f'Q1: {q1:.0f}', horizontalalignment='center', color='blue', weight='bold', fontsize=8)
    plt.text(i + 0.2, q3, f'Q3: {q3:.0f}', horizontalalignment='center', color='green', weight='bold', fontsize=8)
    plt.text(i, q3 + (q3 - q1) * 0.1, f'IQR: {iqr:.0f}', horizontalalignment='center', color='red', weight='bold', fontsize=8)
    plt.text(i, median, f'{median:.0f}', horizontalalignment='center', color='purple', weight='bold', fontsize=9)

plt.tight_layout()
plt.show()

# Share of all flights operated by each airline.
airline_counts = df['Airline'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    airline_counts.values,
    labels=airline_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('Set3'),
)
ax.set_title('Airline Market Share')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
plt.show()

# Scatter/histogram grid for the two numeric features; list further numeric
# columns here to include them.
sns.pairplot(data=df[['Distance', 'Time']])
plt.suptitle('Pairplot of Numerical Features')
plt.show()

# Rows whose 'Time' equals the column maximum / minimum.
# 'Time' is treated as minutes within the day elsewhere in this notebook
# (see categorize_time), so these are the latest and earliest flights of the
# day, not the longest and shortest ones; the printed labels reflect that.
max_time_flight = df[df['Time'] == df['Time'].max()]
min_time_flight = df[df['Time'] == df['Time'].min()]
print("Flights at the Latest Time of Day:\n", max_time_flight)
print("Flights at the Earliest Time of Day:\n", min_time_flight)

Longest Flight:
        Airline  Flight Origin Airport Destination Airport  Time  Distance  \
17609       B6     739            JFK                 PSE  1439       223   
17610       FL     328            SFO                 ATL  1439       270   
17611       UA      78            HNL                 SFO  1439       313   
35654       B6     480            LAX                 BOS  1439       321   
35655       B6     717            JFK                 SJU  1439       220   
...        ...     ...            ...                 ...   ...       ...   
539378      CO     178            OGG                 SNA  1439       326   
539379      FL     398            SEA                 ATL  1439       305   
539380      FL     609            SFO                 MKE  1439       255   
539381      UA      78            HNL                 SFO  1439       313   
539382      US    1442            LAX                 PHL  1439       301   

        Delay Day of Week Time_Category    Route  
17609       0   Wednesday       Evening  JFK-PSE  
17610       0   Wednesday       Evening  SFO-ATL  
17611       0   Wednesday       Evening  HNL-SFO  
35654       0    Thursday       Evening  LAX-BOS  
35655       0    Thursday       Evening  JFK-SJU  
...       ...         ...           ...      ...  
539378      0      Friday       Evening  OGG-SNA  
539379      0      Friday       Evening  SEA-ATL  
539380      0      Friday       Evening  SFO-MKE  
539381      1      Friday       Evening  HNL-SFO  
539382      1      Friday       Evening  LAX-PHL  

[260 rows x 10 columns]
Shortest Flight:
        Airline  Flight Origin Airport Destination Airport  Time  Distance  \
17612       DL    2344            LAS                 CVG    10       215   
35659       DL    2344            LAS                 CVG    10       215   
53799       DL    2344            LAS                 CVG    10       215   
85189       DL    2344            LAS                 CVG    10       215   
137920      DL    2344            LAS                 CVG    10       215   
155976      DL    2344            LAS                 CVG    10       215   
174115      DL    2344            LAS                 CVG    10       215   
205551      DL    2344            LAS                 CVG    10       215   
258373      DL    2344            LAS                 CVG    10       215   
258374      DL    2687            ANC                 SLC    10       285   
276642      DL    2344            LAS                 CVG    10       215   
295227      DL    2344            LAS                 CVG    10       215   
328765      DL    2344            LAS                 CVG    10       215   
383939      DL    2344            LAS                 CVG    10       215   
402506      DL    2344            LAS                 CVG    10       215   
449998      DL    2344            LAS                 CVG    10       215   
468523      DL    2344            LAS                 CVG    10       215   
487025      DL    2344            LAS                 CVG    10       215   
505514      DL    2344            LAS                 CVG    10       215   
524020      DL    2344            LAS                 CVG    10       215   

        Delay Day of Week Time_Category    Route  
17612       0    Thursday         Night  LAS-CVG  
35659       0      Friday         Night  LAS-CVG  
53799       0    Saturday         Night  LAS-CVG  
85189       0      Monday         Night  LAS-CVG  
137920      0    Thursday         Night  LAS-CVG  
155976      0      Friday         Night  LAS-CVG  
174115      0    Saturday         Night  LAS-CVG  
205551      0      Monday         Night  LAS-CVG  
258373      1    Thursday         Night  LAS-CVG  
258374      0    Thursday         Night  ANC-SLC  
276642      0      Friday         Night  LAS-CVG  
295227      0    Saturday         Night  LAS-CVG  
328765      0      Monday         Night  LAS-CVG  
383939      1    Thursday         Night  LAS-CVG  
402506      0      Friday         Night  LAS-CVG  
449998      0      Monday         Night  LAS-CVG  
468523      0     Tuesday         Night  LAS-CVG  
487025      0   Wednesday         Night  LAS-CVG  
505514      0    Thursday         Night  LAS-CVG  
524020      1      Friday         Night  LAS-CVG

# Number of flights on each weekday, busiest day first.
busiest_day = df['Day of Week'].value_counts(sort=True)
print("Flights per Day:\n", busiest_day)

Flights per Day:
 Day of Week
Thursday     91445
Wednesday    89746
Friday       85248
Monday       72769
Tuesday      71340
Sunday       69879
Saturday     58956
Name: count, dtype: int64

# Flights per weekday, bars ordered from the busiest to the quietest day.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Day of Week', hue='Day of Week',
              order=busiest_day.index, palette='viridis', legend=False, ax=ax)
ax.set_title("Flights per Day (Volume)", fontsize=14, fontweight='bold')
ax.set_xlabel("Day of the Week", fontsize=12)
ax.set_ylabel("Number of Flights", fontsize=12)
plt.show()

# 3. Data Visualization Updates
# Mean of 'Delay' (the delay rate) for each time-of-day bucket.
time_category_order = ['Morning', 'Afternoon', 'Evening', 'Night']

plt.figure(figsize=(10, 5))
# `hue` mirrors `x` because seaborn deprecated `palette` without `hue`;
# hue_order keeps the colours in the same order as the bars.
sns.barplot(x='Time_Category', y='Delay', data=df,
            hue='Time_Category', order=time_category_order,
            hue_order=time_category_order, palette='magma', legend=False)
plt.title('Probability of Delay by Time Category')
plt.ylabel('Mean Delay Rate')
plt.show()

C:\Users\amitm\AppData\Local\Temp\ipykernel_27316\890531699.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Time_Category', y='Delay', data=df,

# Five most frequent origin-destination pairs.
most_repeated_route = df['Route'].value_counts().iloc[:5]
print("Most Repeated Routes:\n", most_repeated_route)

Most Repeated Routes:
 Route
LAX-SFO    1079
SFO-LAX    1077
OGG-HNL     982
HNL-OGG     951
SAN-LAX     935
Name: count, dtype: int64

# Weekday name -> weekday number (Monday = 1 ... Sunday = 7), so the day can
# take part in the correlation matrix.
day_mapping = {
    name: number
    for number, name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

df['Day of Week (Num)'] = df['Day of Week'].map(day_mapping)

# Compute correlation matrix
corr_matrix = df[['Day of Week (Num)', 'Time', 'Distance', 'Delay']].corr()

# Plot the heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

# Sample to reduce rendering time (fixed seed, so the same 5000 rows each run).
df_sample = df.sample(5000, random_state=1)

# Plot pairplot
# 'Day of Week' holds weekday names at this point, and pairplot only draws
# numeric columns, so it would be silently left out. The numeric encoding
# 'Day of Week (Num)' is used instead.
sns.pairplot(df_sample[['Day of Week (Num)', 'Time', 'Distance', 'Delay']])
plt.suptitle('Pairwise Plots of Numerical Variables', y=1.02)
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# Share of delayed flights per airline: the mean of the 0/1 'Delay' column.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Airline', y='Delay', data=df, estimator='mean', errorbar=None,
                 palette='viridis', hue='Airline', legend=False)
# The mean is a fraction between 0 and 1; format the ticks as percentages so
# the axis agrees with the "Percentage" wording of the title and label.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
plt.title("Percentage of Flights Delayed by Airline", fontsize=15, fontweight='bold', color='#333333')
plt.xlabel("Airline Carrier", fontsize=12)
plt.ylabel("Delay Percentage", fontsize=12)
plt.xticks(rotation=45)
plt.show()

# Create delay categories
# 'Delay' only takes the values 0 and 1 in this dataset (see df.describe()),
# so minute-based bins (short/medium/long) cannot apply: every delayed flight
# would be labelled 'Short' and the other bands would stay empty. Two bands are
# used instead: 0 -> 'No Delay', anything above 0 -> 'Delayed'.
df['DelayCategory'] = pd.cut(df['Delay'], bins=[-1, 0, float('inf')],
                             labels=['No Delay', 'Delayed'])

# Cross-tabulation: flights per airline (rows) and delay category (columns).
delay_counts = pd.crosstab(df['Airline'], df['DelayCategory'])

# Plot stacked bar chart
delay_counts.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='Set2')
plt.title('Delay Category Distribution per Airline')
plt.ylabel('Number of Flights')
plt.xticks(rotation=45)
plt.show()

# Distribution of the 'Delay' column for each airline.
plt.figure(figsize=(12, 6))
# `hue` mirrors `x` because seaborn deprecated `palette` without `hue`.
sns.boxplot(data=df, x='Airline', y='Delay', hue='Airline', palette='Set2', legend=False)
plt.title('Flight Delay by Airline (ANOVA Visual)')
# 'Delay' is a 0/1 column in this dataset, not a duration in minutes.
plt.ylabel("Delay (0/1 flag)")
plt.xlabel("Airline")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

C:\Users\amitm\AppData\Local\Temp\ipykernel_27316\2524743842.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x='Airline', y='Delay', palette='Set2')

# Ensure correct day order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Distribution of the 'Delay' column for each weekday.
plt.figure(figsize=(10, 5))
# `hue` mirrors `x` because seaborn deprecated `palette` without `hue`;
# hue_order keeps the colours in the same order as the boxes.
sns.boxplot(data=df, x='Day of Week', y='Delay', hue='Day of Week',
            order=day_order, hue_order=day_order, palette='Set3', legend=False)
plt.title('Flight Delay by Day of Week (ANOVA Visual)')
# 'Delay' is a 0/1 column in this dataset, not a duration in minutes.
plt.ylabel("Delay (0/1 flag)")
plt.xlabel("Day of Week")
plt.grid(True)
plt.show()

C:\Users\amitm\AppData\Local\Temp\ipykernel_27316\3262601972.py:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x='Day of Week', y='Delay', order=day_order, palette='Set3')

# Airline mean and SEM (standard error of the mean) of the 'Delay' column.
# The grouping is built once and reused for both statistics.
delay_by_airline = df.groupby('Airline')['Delay']
airline_means = delay_by_airline.mean()
airline_sems = delay_by_airline.sem()

plt.figure(figsize=(12, 6))
plt.errorbar(airline_means.index, airline_means.values, yerr=airline_sems.values,
             fmt='o-', capsize=5, color='teal', ecolor='orange', linewidth=2)
# 'Delay' is a 0/1 column in this dataset, so its mean is a delay rate rather
# than a number of minutes.
plt.title('Mean Delay Rate by Airline (± SEM)')
plt.xlabel('Airline')
plt.ylabel('Mean Delay Rate')
plt.grid(True)
plt.xticks(rotation=45)
plt.show()

# Use categorical columns
categorical_cols = ['Airline', 'Origin Airport', 'Destination Airport', 'Day of Week', 'Time_Category']

# Create transactions: one list of "column=value" items per flight.
# itertuples(name=None) yields plain tuples, which avoids building a Series for
# every row as iterrows() does; the resulting lists are the same.
transactions = [
    [f"{col}={value}" for col, value in zip(categorical_cols, values)]
    for values in df[categorical_cols].itertuples(index=False, name=None)
]

# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_trans = pd.DataFrame(data=te_ary, columns=te.columns_)

# Itemsets present in at least 1% of the transactions (FP-Growth).
frequent_itemsets = fpgrowth(df_trans, min_support=0.01, use_colnames=True)

# Association rules with confidence of at least 0.1.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

# Show the five itemsets with the highest support.
print("Frequent Itemsets:")
display(frequent_itemsets.sort_values(by='support', ascending=False).head())

# Show the five rules with the highest lift.
print("\nAssociation Rules:")
rule_view = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
display(rule_view.sort_values(by='lift', ascending=False).head())

Frequent Itemsets:

Association Rules:

# Ten rules with the highest lift, with the three metrics rounded to three
# decimals for display.
top_rules = (
    rules.sort_values(by='lift', ascending=False)
    .head(10)
    .round({'support': 3, 'confidence': 3, 'lift': 3})
)

# Display just the relevant columns
display(top_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

import networkx as nx
import matplotlib.pyplot as plt

# Directed graph of the top rules: one edge per rule, antecedent -> consequent.
G = nx.DiGraph()

# Add edges from the top rules
for _, rule in top_rules.iterrows():
    # Antecedents/consequents are frozensets. Join all items (sorted, so the
    # label is deterministic) instead of taking an arbitrary first element,
    # which would drop the remaining items of multi-item rules.
    start = ', '.join(sorted(rule['antecedents']))
    end = ', '.join(sorted(rule['consequents']))

    G.add_edge(start, end, weight=rule['lift'])

# Draw it
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, k=1)  # Spread nodes out
nx.draw(G, pos, with_labels=True,
        node_color='skyblue',
        node_size=2500,
        edge_color='red',  # One red arrow per association rule
        width=2,
        font_size=10,
        font_weight='bold',
        arrowsize=20)

# The mined items are airline, airports, weekday and time category; 'Delay' is
# not among them, so the title describes association rules rather than delays.
plt.title("Network of Strongest Association Rules (by Lift)", fontsize=15)
plt.show()

	id	Flight	DayOfWeek	Time	Length	Delay
count	539383.000000	539383.000000	539383.000000	539383.000000	539383.000000	539383.000000
mean	269692.000000	2427.928630	3.929668	802.728963	132.202007	0.445442
std	155706.604461	2067.429837	1.914664	278.045911	70.117016	0.497015
min	1.000000	1.000000	1.000000	10.000000	0.000000	0.000000
25%	134846.500000	712.000000	2.000000	565.000000	81.000000	0.000000
50%	269692.000000	1809.000000	4.000000	795.000000	115.000000	0.000000
75%	404537.500000	3745.000000	5.000000	1035.000000	162.000000	1.000000
max	539383.000000	7814.000000	7.000000	1439.000000	655.000000	1.000000

	support	itemsets
61	0.404998	(Time_Category=Morning)
79	0.383642	(Time_Category=Afternoon)
80	0.199910	(Time_Category=Evening)
74	0.174453	(Airline=WN)
81	0.169536	(Day of Week=Thursday)

	antecedents	consequents	support	confidence	lift
172	(Airline=CO)	(Origin Airport=IAH)	0.011684	0.298418	10.173934
171	(Origin Airport=IAH)	(Airline=CO)	0.011684	0.398331	10.173934
16	(Destination Airport=IAH)	(Airline=CO)	0.011682	0.398318	10.173606
17	(Airline=CO)	(Destination Airport=IAH)	0.011682	0.298371	10.173606
182	(Origin Airport=CLT)	(Airline=US)	0.013150	0.637115	9.960839

	antecedents	consequents	support	confidence	lift
172	(Airline=CO)	(Origin Airport=IAH)	0.012	0.298	10.174
171	(Origin Airport=IAH)	(Airline=CO)	0.012	0.398	10.174
16	(Destination Airport=IAH)	(Airline=CO)	0.012	0.398	10.174
17	(Airline=CO)	(Destination Airport=IAH)	0.012	0.298	10.174
182	(Origin Airport=CLT)	(Airline=US)	0.013	0.637	9.961
183	(Airline=US)	(Origin Airport=CLT)	0.013	0.206	9.961
30	(Airline=US)	(Destination Airport=CLT)	0.013	0.206	9.958
31	(Destination Airport=CLT)	(Airline=US)	0.013	0.637	9.958
18	(Destination Airport=IAH)	(Airline=XE)	0.015	0.502	8.691
19	(Airline=XE)	(Destination Airport=IAH)	0.015	0.255	8.691

	id	Airline	Flight	AirportFrom	AirportTo	DayOfWeek	Time	Length	Delay
0	1	CO	269	SFO	IAH	3	15	205	1
1	2	US	1558	PHX	CLT	3	15	222	1
2	3	AA	2400	LAX	DFW	3	20	165	1
3	4	AA	2466	SFO	DFW	3	20	195	1
4	5	AS	108	ANC	SEA	3	30	202	0
...	...	...	...	...	...	...	...	...	...
539378	539379	CO	178	OGG	SNA	5	1439	326	0
539379	539380	FL	398	SEA	ATL	5	1439	305	0
539380	539381	FL	609	SFO	MKE	5	1439	255	0
539381	539382	UA	78	HNL	SFO	5	1439	313	1
539382	539383	US	1442	LAX	PHL	5	1439	301	1

	id	Airline	Flight	AirportFrom	AirportTo	DayOfWeek	Time	Length	Delay
243249	243250	AA	726	ELP	DFW	3	465	105	0
514772	514773	DL	78	MCO	JFK	4	805	160	0
449906	449907	XE	2293	IAH	CRP	7	1380	55	0
463631	463632	XE	3082	SDF	CLE	1	1034	70	1
307162	307163	AA	1372	MIA	RDU	6	1015	125	1

Importing packages¶

Data Collection¶

Reading the dataset¶

First five rows¶

Last five rows¶

Random five rows¶

Information about dataset¶

Statistical Analysis of Raw Features¶

Mean of values¶

Median of values¶

Mode of Values¶

Maximum value¶

Minimum value¶

Percentiles¶

25th percentile (first quartile)¶

75th percentile (third quartile)¶

Variance and Standard Deviation¶

Data Preprocessing¶

Null values¶

Duplicate values¶

Feature Engineering¶

Changing DayOfWeek column to names¶

Categorizing Flight Times¶

Changing 'AirportFrom' column to 'Origin Airport'¶

Changing 'AirportTo' column to 'Destination Airport'¶

Changing 'Length' to 'Distance'¶

Creating 'Route' column¶

Remove 'id' column as it is unwanted¶

Finding unique values in columns¶

Statistical Analysis of Engineered Feature¶

Data Visualization¶

Correlation Heatmap¶

Univariate analysis¶

Categorical Variables¶

Count plot¶

Pie Chart¶

Numerical Variable¶

Barplot¶

Displot¶

Box Plot¶

Bivariate Analysis¶

Bar Chart¶

Barplot¶

Visualizing Delays by Time Category¶

Histogram¶

Boxplot¶

PieChart¶

Pairplot¶

Visualizing the Impact of Time on Delays¶

Multi-Variate analysis¶

Correlation Heatmap¶

Pairplot¶

Barplot: Delay by Airline¶

Stacked Bar plot: Airline Vs Delay Category¶

Boxplot: Delay by Airline¶

Boxplot: Delay by Day of the Week¶

Mean plot with Error Bars (Mean ± SEM)¶

Mining Association Rules (FP-Growth)¶

Why Lift?¶

Visualizing Delay Patterns (Network Graph)¶

Graph Legend¶