Sep-07-2022, 09:42 AM
May I suggest spacing the code out a bit to make it easier to read?
# Create a pipeline that extracts features from the data, then creates
# (evaluates) a model.
#
# Feature extraction lives inside the Pipeline, so cross_val_score refits the
# PCA and SelectKBest steps on the training portion of every fold.
import numpy as np  # not used below; kept from the original listing
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline

# --- load data ---------------------------------------------------------------
# Column names are supplied explicitly via `names`.
filename = "pima-indians-diabetes.data.csv"
names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
dataframe = pd.read_csv(filename, names=names)

# Split the raw array: the first eight columns are the inputs, the ninth
# ("class") is the target.
array = dataframe.values
X = array[:, 0:8]
y = array[:, 8]

# --- create feature union ----------------------------------------------------
# Two feature extractors are applied to the same input and combined:
# 3 principal components plus the 6 best-scoring original columns.
features = [
    ("pca", PCA(n_components=3)),
    ("select_best", SelectKBest(k=6)),
]
feature_union = FeatureUnion(features)

# --- create pipeline ---------------------------------------------------------
estimators = [
    ("feature_union", feature_union),
    ("logistic", LogisticRegression(solver="newton-cg")),
]
model = Pipeline(estimators)

# --- evaluate pipeline -------------------------------------------------------
# Shuffled 10-fold cross-validation; the fixed seed makes the split repeatable.
seed = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, y, cv=kfold)

# Report the mean score across the ten folds.
print(results.mean())