def describe_dataset(dataset_name: str, tool_context: ToolContext) -> dict:
    """Produce a descriptive-statistics summary for a stored dataset.

    Args:
        dataset_name: Key of the dataset in DATA_STORE.
        tool_context: Tool invocation context (unused here; part of the tool API).

    Returns:
        A serializable dict with an "overview" section (row/column counts,
        memory, duplicates, missing values), plus "numeric_summary" and
        "categorical_summary" sections when such columns exist, or an
        error dict if the dataset is not found.
    """
    print(f"π Describing dataset: {dataset_name}")
    df = DATA_STORE.get_dataset(dataset_name)
    if df is None:
        return {"status": "error", "message": f"Dataset '{dataset_name}' not found"}
    # Partition columns by dtype; the two summaries below are built per group.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    result = {
        "status": "success",
        "dataset": dataset_name,
        "overview": {
            "total_rows": int(len(df)),
            "total_columns": int(len(df.columns)),
            "numeric_columns": numeric_cols,
            "categorical_columns": categorical_cols,
            "memory_mb": round(float(df.memory_usage(deep=True).sum() / 1024 / 1024), 2),
            "duplicate_rows": int(df.duplicated().sum()),
            "missing_total": int(df.isnull().sum().sum())
        }
    }
    if numeric_cols:
        stats_dict = {}
        for col in numeric_cols:
            col_data = df[col].dropna()
            # Skip all-NaN columns: quantiles/skew are undefined on empty data.
            if len(col_data) > 0:
                stats_dict[col] = {
                    "count": int(len(col_data)),
                    "mean": round(float(col_data.mean()), 3),
                    "std": round(float(col_data.std()), 3),
                    "min": round(float(col_data.min()), 3),
                    "25%": round(float(col_data.quantile(0.25)), 3),
                    "50%": round(float(col_data.median()), 3),
                    "75%": round(float(col_data.quantile(0.75)), 3),
                    "max": round(float(col_data.max()), 3),
                    "skewness": round(float(col_data.skew()), 3),
                    "missing": int(df[col].isnull().sum())
                }
        result["numeric_summary"] = stats_dict
    if categorical_cols:
        cat_dict = {}
        # Cap at 10 columns to keep the response payload bounded.
        for col in categorical_cols[:10]:
            vc = df[col].value_counts()
            cat_dict[col] = {
                "unique_values": int(df[col].nunique()),
                "top_values": {str(k): int(v) for k, v in vc.head(5).items()},
                "missing": int(df[col].isnull().sum())
            }
        result["categorical_summary"] = cat_dict
    DATA_STORE.log_analysis("describe", dataset_name, "Statistics generated")
    return make_serializable(result)
def correlation_analysis(dataset_name: str, method: str = "pearson", tool_context: ToolContext = None) -> dict:
    """Compute a correlation matrix over the numeric columns of a dataset.

    Args:
        dataset_name: Key of the dataset in DATA_STORE.
        method: Correlation method passed to DataFrame.corr
            ("pearson", "kendall", or "spearman").
        tool_context: Tool invocation context (unused here; part of the tool API).

    Returns:
        A serializable dict with the full correlation matrix, the strongest
        pairs (|r| > 0.5, top 10), and a summary insight; or an error dict
        when the dataset is missing or has fewer than 2 numeric columns.
    """
    print(f"π Correlation analysis: {dataset_name} ({method})")
    df = DATA_STORE.get_dataset(dataset_name)
    if df is None:
        return {"status": "error", "message": f"Dataset '{dataset_name}' not found"}
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] < 2:
        return {"status": "error", "message": "Need at least 2 numeric columns"}
    corr_matrix = numeric_df.corr(method=method)
    # Scan the upper triangle only (i < j) so each pair is reported once.
    strong_corrs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            col1, col2 = corr_matrix.columns[i], corr_matrix.columns[j]
            val = corr_matrix.iloc[i, j]
            if abs(val) > 0.5:
                strong_corrs.append({
                    "var1": col1,
                    "var2": col2,
                    "correlation": round(float(val), 3),
                    "strength": "strong" if abs(val) > 0.7 else "moderate"
                })
    strong_corrs.sort(key=lambda x: abs(x["correlation"]), reverse=True)
    corr_dict = {}
    for col in corr_matrix.columns:
        corr_dict[col] = {k: round(float(v), 3) for k, v in corr_matrix[col].items()}
    DATA_STORE.log_analysis("correlation", dataset_name, f"{method} correlation")
    return make_serializable({
        "status": "success",
        "method": method,
        "correlation_matrix": corr_dict,
        "strong_correlations": strong_corrs[:10],
        "insight": f"Found {len(strong_corrs)} pairs with |correlation| > 0.5"
    })
def hypothesis_test(dataset_name: str, test_type: str, column1: str,
                    column2: str = None, group_column: str = None,
                    tool_context: ToolContext = None) -> dict:
    """Run a statistical hypothesis test on a stored dataset.

    Args:
        dataset_name: Key of the dataset in DATA_STORE.
        test_type: One of "normality" (Shapiro-Wilk), "ttest"
            (independent samples), "anova" (one-way), or "chi2"
            (chi-square test of independence).
        column1: Primary column under test.
        column2: Second categorical column (required for "chi2").
        group_column: Grouping column (required for "ttest" and "anova").
        tool_context: Tool invocation context (unused here; part of the tool API).

    Returns:
        A serializable dict with the test statistic, p-value, and a plain
        interpretation at alpha = 0.05; or an error dict for bad inputs
        or test failures.
    """
    print(f"π Hypothesis test: {test_type} on {dataset_name}")
    df = DATA_STORE.get_dataset(dataset_name)
    if df is None:
        return {"status": "error", "message": f"Dataset '{dataset_name}' not found"}
    if column1 not in df.columns:
        return {"status": "error", "message": f"Column '{column1}' not found"}
    try:
        if test_type == "normality":
            data = df[column1].dropna()
            # Shapiro-Wilk loses accuracy (and scipy warns) above ~5000 points.
            if len(data) > 5000:
                data = data.sample(5000)
            stat, p = stats.shapiro(data)
            return make_serializable({
                "status": "success",
                "test": "Shapiro-Wilk Normality Test",
                "column": column1,
                "statistic": round(float(stat), 4),
                "p_value": round(float(p), 6),
                "is_normal": bool(p > 0.05),
                "interpretation": "Data appears normally distributed" if p > 0.05 else "Data is NOT normally distributed"
            })
        elif test_type == "ttest":
            if group_column is None:
                return {"status": "error", "message": "group_column required for t-test"}
            groups = df[group_column].dropna().unique()
            if len(groups) != 2:
                return {"status": "error", "message": f"T-test needs exactly 2 groups, found {len(groups)}: {list(groups)}"}
            g1 = df[df[group_column] == groups[0]][column1].dropna()
            g2 = df[df[group_column] == groups[1]][column1].dropna()
            stat, p = stats.ttest_ind(g1, g2)
            return make_serializable({
                "status": "success",
                "test": "Independent Samples T-Test",
                "comparing": column1,
                "group1": {"name": str(groups[0]), "mean": round(float(g1.mean()), 3), "n": int(len(g1))},
                "group2": {"name": str(groups[1]), "mean": round(float(g2.mean()), 3), "n": int(len(g2))},
                "t_statistic": round(float(stat), 4),
                "p_value": round(float(p), 6),
                "significant": bool(p < 0.05),
                "interpretation": "Significant difference" if p < 0.05 else "No significant difference"
            })
        elif test_type == "anova":
            if group_column is None:
                return {"status": "error", "message": "group_column required for ANOVA"}
            groups_data = [grp[column1].dropna().values for _, grp in df.groupby(group_column)]
            group_names = list(df[group_column].unique())
            stat, p = stats.f_oneway(*groups_data)
            group_stats = []
            for name in group_names:
                grp_data = df[df[group_column] == name][column1].dropna()
                group_stats.append({
                    "group": str(name),
                    "mean": round(float(grp_data.mean()), 3),
                    "std": round(float(grp_data.std()), 3),
                    "n": int(len(grp_data))
                })
            return make_serializable({
                "status": "success",
                "test": "One-Way ANOVA",
                "comparing": column1,
                "across": group_column,
                "n_groups": int(len(group_names)),
                "group_statistics": group_stats,
                "f_statistic": round(float(stat), 4),
                "p_value": round(float(p), 6),
                "significant": bool(p < 0.05),
                "interpretation": "Significant differences among groups" if p < 0.05 else "No significant differences"
            })
        elif test_type == "chi2":
            if column2 is None:
                return {"status": "error", "message": "column2 required for chi-square test"}
            contingency = pd.crosstab(df[column1], df[column2])
            chi2, p, dof, _ = stats.chi2_contingency(contingency)
            return make_serializable({
                "status": "success",
                "test": "Chi-Square Test of Independence",
                "variables": [column1, column2],
                "chi2_statistic": round(float(chi2), 4),
                "p_value": round(float(p), 6),
                "degrees_of_freedom": int(dof),
                "significant": bool(p < 0.05),
                "interpretation": "Variables are dependent" if p < 0.05 else "Variables are independent"
            })
        else:
            return {"status": "error", "message": f"Unknown test: {test_type}. Use: normality, ttest, anova, chi2"}
    except Exception as e:
        # Tool boundary: surface any scipy/pandas failure as an error payload
        # rather than crashing the agent loop.
        return {"status": "error", "message": f"Test failed: {str(e)}"}
def outlier_detection(dataset_name: str, column: str, method: str = "iqr",
                      tool_context: ToolContext = None) -> dict:
    """Detect outliers in a numeric column using IQR fences or z-scores.

    Args:
        dataset_name: Key of the dataset in DATA_STORE.
        column: Numeric column to inspect (NaNs are dropped first).
        method: "iqr" (1.5 * IQR fences) or "zscore" (|z| > 3).
        tool_context: Tool invocation context (unused here; part of the tool API).

    Returns:
        A serializable dict with the outlier count, percentage, and up to
        10 example values; or an error dict for a missing dataset/column
        or unknown method.
    """
    print(f"π Outlier detection: {column} in {dataset_name}")
    df = DATA_STORE.get_dataset(dataset_name)
    if df is None:
        return {"status": "error", "message": f"Dataset '{dataset_name}' not found"}
    if column not in df.columns:
        return {"status": "error", "message": f"Column '{column}' not found"}
    data = df[column].dropna()
    if method == "iqr":
        # Tukey's rule: anything beyond 1.5 * IQR from the quartiles.
        Q1 = float(data.quantile(0.25))
        Q3 = float(data.quantile(0.75))
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outliers = data[(data < lower) | (data > upper)]
        return make_serializable({
            "status": "success",
            "method": "IQR (Interquartile Range)",
            "column": column,
            "bounds": {"lower": round(lower, 3), "upper": round(upper, 3)},
            "iqr": round(IQR, 3),
            "total_values": int(len(data)),
            "outlier_count": int(len(outliers)),
            "outlier_pct": round(float(len(outliers) / len(data) * 100), 2),
            "outlier_examples": [round(float(x), 2) for x in outliers.head(10).tolist()]
        })
    elif method == "zscore":
        # Standard-score rule: |z| > 3 flags roughly the 0.3% tails
        # of a normal distribution.
        z = np.abs(stats.zscore(data))
        outliers = data[z > 3]
        return make_serializable({
            "status": "success",
            "method": "Z-Score (threshold: 3)",
            "column": column,
            "total_values": int(len(data)),
            "outlier_count": int(len(outliers)),
            "outlier_pct": round(float(len(outliers) / len(data) * 100), 2),
            "outlier_examples": [round(float(x), 2) for x in outliers.head(10).tolist()]
        })
    return {"status": "error", "message": f"Unknown method: {method}. Use: iqr, zscore"}
# Confirmation banner printed once the tool functions above are defined.
print("✅ Statistical analysis tools defined!")
# NOTE(review): removed scraped web-page boilerplate (newsletter sign-up and
# "Trending" navigation links) that was accidentally pasted into this module
# and was not valid Python.