LDA更改

发布时间:2026/6/28 2:09:59

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""PCA loading analysis -> LDA key-factor extraction (NaN-safe revision).

Adapted for two kiln groups, HY and YY.

Pipeline:
    1. Read a PCA loading matrix (rows = PCs, columns = energy channels plus
       a ``Variance`` column) and the raw XRF table.
    2. Detect the kiln-label column and the energy-channel columns.
    3. Standardize the XRF data, project it onto the PCA loadings, and fit a
       one-component LDA on the leading PC scores.
    4. Plot the variance contributions, the LDA result, and the per-channel
       contribution (LDA weight x loading), then print a summary.

NOTE(review): this script was recovered from a copy in which quotes and
operators had been stripped.  Comparison operators marked ``NOTE(review)``
below are the most plausible reading and should be confirmed.
"""
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# scikit-learn and matplotlib are imported inside the functions that need
# them so the pure numeric helpers stay importable without those packages.

LOADING_FILE = r"D:\开题相关\PCA载荷矩阵.csv"
XRF_FILE = r"D:\开题相关\XRF原始数据.csv"
OUTPUT_DIR = Path(r"D:\开题相关\PythonProject")

# Number of leading principal components fed to the LDA.
MAX_PCS_FOR_LDA = 5


def detect_kiln_column(df_xrf: pd.DataFrame, max_classes: int = 10) -> str:
    """Return the name of the kiln-label column of *df_xrf*.

    The first column is skipped (it is treated as the sample identifier).
    The first remaining text column with fewer than *max_classes* distinct
    values is chosen; if none qualifies, the second column is used.

    Raises:
        ValueError: if the table has fewer than two columns.
    """
    if df_xrf.shape[1] < 2:
        raise ValueError("XRF数据至少需要两列(样品编号和窑口标签)")
    for col in df_xrf.columns[1:]:
        series = df_xrf[col]
        is_text = series.dtype == object or pd.api.types.is_string_dtype(series)
        # NOTE(review): the comparison operator was lost; "<" is assumed.
        if is_text and series.nunique() < max_classes:
            return col
    return df_xrf.columns[1]


def select_energy_columns(df_xrf: pd.DataFrame, kiln_col, loading_cols) -> list:
    """Return the XRF columns to project, ordered like the loading matrix.

    When every loading-matrix column exists in *df_xrf*, those names are
    returned in loading order so the matrix product pairs matching channels.
    Otherwise every column except ``"sample"`` and *kiln_col* is used
    positionally, provided the counts agree.

    Raises:
        ValueError: if the fallback column count differs from the number of
            loading columns (the projection would be ill-defined).
    """
    loading_cols = list(loading_cols)
    available = set(df_xrf.columns)
    if loading_cols and all(col in available for col in loading_cols):
        return loading_cols

    fallback = [c for c in df_xrf.columns if c not in ("sample", kiln_col)]
    if len(fallback) != len(loading_cols):
        raise ValueError(
            f"XRF数据有{len(fallback)}个能量列, "
            f"载荷矩阵有{len(loading_cols)}列, 无法投影"
        )
    print("列名与载荷矩阵不完全一致, 按列顺序对应")
    return fallback


def count_labels(labels) -> dict:
    """Return ``{label: count}`` sorted by label, ignoring missing labels.

    ``np.unique`` cannot sort an object array that mixes strings with NaN,
    so the counting goes through pandas after dropping missing values.
    """
    counts = pd.Series(labels).dropna().value_counts().sort_index()
    return {label: int(n) for label, n in counts.items()}


def compute_pca_scores(x_scaled, loadings) -> np.ndarray:
    """Project standardized data onto PCA loadings, tolerating NaN inputs.

    Missing standardized values are replaced by 0 (the column mean after
    standardization) *before* the projection.  Filling the scores afterwards
    would zero the whole row of any sample with a single missing channel.

    Args:
        x_scaled: array of shape (n_samples, n_channels).
        loadings: array of shape (n_pcs, n_channels).

    Returns:
        Array of shape (n_samples, n_pcs).

    Raises:
        ValueError: if the channel counts of the two inputs differ.
    """
    x_scaled = np.asarray(x_scaled, dtype=float)
    loadings = np.asarray(loadings, dtype=float)
    if x_scaled.ndim != 2 or loadings.ndim != 2:
        raise ValueError("x_scaled和loadings必须是二维数组")
    if x_scaled.shape[1] != loadings.shape[1]:
        raise ValueError(
            f"数据有{x_scaled.shape[1]}列, 载荷矩阵有{loadings.shape[1]}列"
        )
    filled = np.where(np.isnan(x_scaled), 0.0, x_scaled)
    return filled @ loadings.T


def element_contributions(lda_weights, loadings) -> np.ndarray:
    """Return the per-channel contribution: sum_i weight_i * loadings[i, :].

    Only the first ``len(lda_weights)`` rows of *loadings* are used.
    """
    lda_weights = np.asarray(lda_weights, dtype=float)
    loadings = np.asarray(loadings, dtype=float)
    return lda_weights @ loadings[: len(lda_weights), :]


def plot_variance(variance, components, out_path) -> None:
    """Figure 1: per-PC variance contribution with the cumulative curve."""
    import matplotlib.pyplot as plt

    n_pcs = len(variance)
    cumulative_ratio = np.cumsum(variance)
    positions = range(1, n_pcs + 1)

    fig, ax = plt.subplots(figsize=(10, 7))
    # NOTE(review): operator lost; "i < 3" (highlight first three PCs) assumed.
    bar_colors = ["#B85C38" if i < 3 else "#3B5A8C" for i in range(n_pcs)]
    ax.bar(positions, variance * 100, color=bar_colors, alpha=0.8,
           edgecolor="white", linewidth=0.5)
    ax.plot(positions, cumulative_ratio * 100, "o-", color="#2E8B57",
            linewidth=2, markersize=6, label="累计贡献率")
    ax.set_xlabel("主成分", fontsize=13)
    ax.set_ylabel("方差贡献率 (%)", fontsize=13)
    ax.set_title("各PC方差贡献率", fontweight="bold", fontsize=15)
    ax.set_xticks(positions)
    ax.set_xticklabels(components, fontsize=11)
    ax.legend(fontsize=12)
    ax.grid(True, alpha=0.3)
    for i, (v, cr) in enumerate(zip(variance, cumulative_ratio)):
        ax.text(i + 1, v * 100 + 1, f"{v * 100:.1f}%", ha="center",
                fontsize=10)
        # NOTE(review): operator lost; "i == 2" (label the cumulative value
        # at the third PC) assumed.
        if i == 2:
            ax.text(i + 1, cr * 100 + 2, f"累计{cr * 100:.1f}%", ha="center",
                    fontsize=10, color="#2E8B57")
    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.show()


def plot_lda(lda_scores, labels, lda_weights, pc_used, out_path) -> None:
    """Figure 2: LD1 scatter per kiln (left) and LDA weight per PC (right)."""
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(16, 7))

    # --- left: LDA scatter ---
    ax = axes[0]
    colors_map = {"HY": "#6A5ACD", "YY": "#2E8B57"}
    markers = {"HY": "s", "YY": "o"}
    for k in count_labels(labels):
        m = labels == k
        ax.scatter(lda_scores[m, 0], np.zeros(m.sum()),
                   c=colors_map.get(k, "gray"), marker=markers.get(k, "o"),
                   s=120, label=k, edgecolors="w", linewidth=0.5, alpha=0.8)
    ax.set_title("LDA判别分析基于PCA得分", fontweight="bold", fontsize=14)
    ax.set_xlabel("LD1", fontsize=12)
    ax.set_ylabel("(一维判别)", fontsize=12)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

    # --- right: weight of the discriminant axis on each PC ---
    ax = axes[1]
    weight_colors = ["#B85C38" if w > 0 else "#3B5A8C" for w in lda_weights]
    ax.barh(range(len(pc_used)), lda_weights, color=weight_colors, alpha=0.8,
            edgecolor="white", linewidth=0.5)
    ax.set_yticks(range(len(pc_used)))
    ax.set_yticklabels(pc_used, fontsize=12)
    ax.set_xlabel("LDA判别权重", fontsize=12)
    ax.set_title("LDA判别轴在各PC上的权重\n(权重越大对产地判别越重要)",
                 fontweight="bold", fontsize=14)
    ax.axvline(0, color="black", linewidth=0.8)
    ax.grid(True, alpha=0.3, axis="x")
    for i, w in enumerate(lda_weights):
        ax.text(w + 0.01 if w > 0 else w - 0.01, i, f"{w:.3f}",
                ha="left" if w > 0 else "right", va="center", fontsize=10)
    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.show()


def plot_element_importance(element_importance, out_path) -> None:
    """Figure 3: signed contribution of every energy channel."""
    import matplotlib.pyplot as plt

    contrib = element_importance["LDA综合贡献"]
    names = element_importance["元素能量"]

    fig, ax = plt.subplots(figsize=(12, 8))
    colors = ["#B85C38" if v > 0 else "#3B5A8C" for v in contrib]
    ax.barh(range(len(element_importance)), contrib, color=colors, alpha=0.8,
            edgecolor="white", linewidth=0.5)
    ax.set_yticks(range(len(element_importance)))
    ax.set_yticklabels(names, fontsize=12)
    ax.set_xlabel("LDA综合贡献度判别权重 × 载荷", fontsize=13)
    ax.set_title("各元素能量对产地判别的综合贡献度\n(基于PCA-LDA三级递进)",
                 fontweight="bold", fontsize=15)
    ax.axvline(0, color="black", linewidth=0.8)
    ax.grid(True, alpha=0.3, axis="x")
    for i, val in enumerate(contrib):
        offset = 0.005 if val > 0 else -0.005
        align = "left" if val > 0 else "right"
        ax.text(val + offset, i, f"{val:.3f}", ha=align, va="center",
                fontsize=10)
    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.show()


def main() -> None:
    """Run the full PCA -> LDA analysis and write the three figures."""
    import matplotlib.pyplot as plt
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    from sklearn.preprocessing import StandardScaler

    warnings.filterwarnings("ignore")

    # ---- 1. Read data ----
    df_load = pd.read_csv(LOADING_FILE, index_col=0)
    pc_names = df_load.index.values
    variance = df_load["Variance"].values.astype(float)
    energy_cols = [c for c in df_load.columns if c != "Variance"]
    loadings = df_load[energy_cols].values.astype(float)

    df_xrf = pd.read_csv(XRF_FILE)
    print("原始XRF数据列名", df_xrf.columns.tolist())
    print("前3行")
    print(df_xrf.head(3))

    # ---- 2. Detect the kiln-label column and the energy columns ----
    kiln_col = detect_kiln_column(df_xrf)
    print(f"\n自动识别窑口标签列: {kiln_col}")
    print(f"窑口类别: {df_xrf[kiln_col].unique()}")

    energy_cols_xrf = select_energy_columns(df_xrf, kiln_col, energy_cols)
    print(f"元素能量列: {energy_cols_xrf}")

    # Coerce to float so stray text cells become NaN instead of breaking
    # np.isnan on an object array.
    X_raw = (df_xrf[energy_cols_xrf]
             .apply(pd.to_numeric, errors="coerce")
             .to_numpy(dtype=float))
    kiln_labels = df_xrf[kiln_col].to_numpy(dtype=object)

    label_counts = count_labels(kiln_labels)
    print(f"\n样品数: {len(kiln_labels)}")
    print(f"窑口类别: {np.array(list(label_counts))}")
    for k, n in label_counts.items():
        print(f"  {k}: {n}")

    # ---- 3. PCA -> LDA (NaN-safe) ----
    X_scaled = StandardScaler().fit_transform(X_raw)
    n_nan_scaled = int(np.isnan(X_scaled).sum())
    print(f"\n原始数据NaN数量: {np.isnan(X_raw).sum()}")
    print(f"标准化后NaN数量: {n_nan_scaled}")
    if n_nan_scaled > 0:
        print("检测到NaN用0填充...")

    scores = compute_pca_scores(X_scaled, loadings)
    print(f"PCA得分NaN数量: {np.isnan(scores).sum()}")
    print(f"标签NaN数量: {pd.isna(kiln_labels).sum()}")

    # Drop samples whose scores are still non-finite (e.g. NaN in the
    # loading matrix) or whose kiln label is missing.
    valid_mask = np.isfinite(scores).all(axis=1) & ~pd.isna(kiln_labels)
    scores = scores[valid_mask]
    kiln_labels_clean = kiln_labels[valid_mask]
    clean_counts = count_labels(kiln_labels_clean)
    print(f"有效样品数: {len(kiln_labels_clean)}")
    for k, n in clean_counts.items():
        print(f"  {k}: {n}")
    if len(clean_counts) < 2:
        raise ValueError("LDA至少需要两个窑口类别的有效样品")

    n_pcs = loadings.shape[0]
    components = [str(name) for name in pc_names]

    plt.rcParams["font.sans-serif"] = ["SimHei", "Microsoft YaHei",
                                       "DejaVu Sans"]
    plt.rcParams["axes.unicode_minus"] = False
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---- Figure 1: variance contribution ----
    plot_variance(variance, components, OUTPUT_DIR / "图1_方差贡献率.png")
    print("图1已保存")

    # ---- Figure 2: LDA ----
    n_pc_use = min(MAX_PCS_FOR_LDA, n_pcs)
    lda = LDA(n_components=1)
    lda_scores = lda.fit_transform(scores[:, :n_pc_use], kiln_labels_clean)
    lda_weights = lda.coef_[0]
    pc_used = components[:n_pc_use]
    plot_lda(lda_scores, kiln_labels_clean, lda_weights, pc_used,
             OUTPUT_DIR / "图2_LDA判别分析.png")
    print("图2已保存")

    # ---- Figure 3: trace back to energy channels ----
    element_contrib = element_contributions(lda_weights, loadings)
    element_importance = pd.DataFrame({
        "元素能量": energy_cols_xrf,
        "LDA综合贡献": element_contrib,
        "绝对贡献": np.abs(element_contrib),
    }).sort_values("绝对贡献", ascending=True)
    plot_element_importance(element_importance,
                            OUTPUT_DIR / "图3_关键元素能量.png")
    print("图3已保存")

    # ---- Summary ----
    print("\n" + "=" * 60)
    print("【关键因子提取结果】")
    print("=" * 60)
    print("\n1. LDA判别轴权重各PC对产地判别的贡献")
    for pc, w in zip(pc_used, lda_weights):
        # NOTE(review): operator lost; "> 0.3" assumed.
        print(f"  {pc}: {w:.4f} {'(关键)' if abs(w) > 0.3 else ''}")
    print("\n2. 对产地判别贡献最大的元素能量Top 5")
    top = element_importance.tail(5)
    for energy, val in zip(top["元素能量"], top["LDA综合贡献"]):
        direction = "正向" if val > 0 else "负向"
        print(f"  {energy}: {val:.4f} ({direction})")
    print("\n" + "=" * 60)


if __name__ == "__main__":
    main()

相关新闻