Skip to content
Snippets Groups Projects
Commit 3dc5753b authored by Yann Audin's avatar Yann Audin
Browse files

2024-03-25c

parent 767a79d9
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
# Load the pairwise fragment-similarity table and discard self-comparisons.
import pandas as pd

path = r"C:\Users\audin\gitlab\anthalgo\DONNEES\DataMed\similarities\fra_6_simple_copy.csv"
df = pd.read_csv(path, index_col=0, sep=",", decimal=".")

# Keep only rows that compare two *different* fragments, then renumber rows.
distinct_pairs = df['code_1'] != df['code_2']
df = df.loc[distinct_pairs].reset_index(drop=True)

# Similarity-metric columns used as model features, and the fragment-id columns.
parameters = [
    "cos_bow_fra_rsw_lwc_ste_ra",
    "gram_2_fra_lwc_ste",
    "jac_bow_fra_rp_lwc_ste_ra",
    "lev_fra_rsw_rp_lwc",
    "tf_idf_fra_rsw_ste",
]
names = ["code_1", "code_2"]

print(df.columns)
```
%% Output
Index(['fragment_1', 'code_1', 'fragment_2', 'code_2', 'P', 'D', 'R', 'S',
'cos_bow_fra_rsw_lwc_ste_ra', 'gram_2_fra_lwc_ste',
'jac_bow_fra_rp_lwc_ste_ra', 'lev_fra_rsw_rp_lwc',
'tf_idf_fra_rsw_ste'],
dtype='object')
%% Cell type:code id: tags:
``` python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score
def logistic_regression_analysis(X_train, X_test, y_train, y_test, feature_names):
    """Fit a logistic regression on the training data and report test-set results.

    Parameters
    ----------
    X_train, X_test : feature matrices used for fitting and evaluation.
    y_train, y_test : binary target vectors aligned with X_train / X_test.
    feature_names : feature column names, used for the coefficient report.

    Returns
    -------
    tuple : (fitted model, confusion matrix, classification-report string,
             balanced accuracy), all computed on (y_test, predictions).

    Side effects: prints each false-positive, false-negative and true-positive
    pair. Fragment codes are looked up in the global ``df`` — assumes the rows
    of X_test are positionally aligned with ``df`` (true in this notebook,
    where X_test is the whole DataFrame) — TODO confirm if a real split is
    ever introduced.
    """
    # High max_iter so the solver converges on these similarity features.
    logistic_reg = LogisticRegression(max_iter=100000, random_state=20)
    logistic_reg.fit(X_train, y_train)

    # Predict labels on the test set.
    y_pred = logistic_reg.predict(X_test)

    # Confusion matrix against the *test* labels.
    cm = confusion_matrix(y_test, y_pred)

    def _report_pairs(label, predicted, actual):
        """Print every test pair with the given (prediction, truth) combination
        and return the count.

        BUG FIX: the original compared y_pred against y_train and iterated
        over len(df); test-set predictions must be compared against y_test,
        over the length of y_pred. (Identical output in this notebook only
        because train == test == whole df.)
        """
        count = 0
        for i in range(len(y_pred)):
            if y_pred[i] == predicted and y_test[i] == actual:
                print(f"{label}: {df.code_1[i]}, {df.code_2[i]}")
                count += 1
        return count

    print("Number of False positives :" + str(_report_pairs("False positive", 1, 0)))
    print("Number of False negative :" + str(_report_pairs("False negative", 0, 1)))
    print("Number of True positive :" + str(_report_pairs("True positive", 1, 1)))

    # Per-class precision/recall/F1 and class-balanced accuracy.
    cls_report = classification_report(y_test, y_pred)
    bal_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Report the learned linear rule.
    print("Feature Importance:")
    for feature, importance in zip(feature_names, logistic_reg.coef_[0]):
        print(f"{feature}: {importance}")
    print("Bias (Intercept):", logistic_reg.intercept_)

    return logistic_reg, cm, cls_report, bal_accuracy
# Defining the feature columns and the target column
features = ['cos_bow_fra_rsw_lwc_ste_ra', 'gram_2_fra_lwc_ste', 'jac_bow_fra_rp_lwc_ste_ra', 'lev_fra_rsw_rp_lwc', 'tf_idf_fra_rsw_ste']
# Target 'S' is the binary "stylistic variation" label seen in the outputs.
target = 'S'
# Splitting the data into features and target
X = df[features]
y = df[target]
# NOTE(review): no actual train/test split — the same full dataset is used
# for both fitting and evaluation, so the reported metrics measure in-sample
# fit, not generalization.
X_train, X_test, y_train, y_test, feature_names = X, X, y, y, features
# Perform logistic regression analysis
log_reg_model, confusion_mat, class_report, bal_accuracy = logistic_regression_analysis(X_train, X_test, y_train, y_test, feature_names)
# Print results
print("\nConfusion Matrix:")
print(confusion_mat)
print("\nClassification Report:")
print(class_report)
print("\nBalanced Accuracy:", bal_accuracy)
```
%% Output
False positive: b6f7, b6f8
False positive: b6f95, b6f104
False positive: b6f160, b6f247
False positive: b6f200, b6f202
False positive: b6f286, b6f287
Number of False positives :5
False negative: b6f4, b6f5
False negative: b6f4, b6f23
False negative: b6f4, b6f27
False negative: b6f4, b6f28
False negative: b6f4, b6f29
False negative: b6f4, b6f38
False negative: b6f4, b6f90
False negative: b6f4, b6f192
False negative: b6f5, b6f23
False negative: b6f5, b6f27
False negative: b6f5, b6f29
False negative: b6f5, b6f38
False negative: b6f5, b6f90
False negative: b6f5, b6f192
False negative: b6f11, b6f12
False negative: b6f11, b6f13
False negative: b6f11, b6f16
False negative: b6f11, b6f179
False negative: b6f11, b6f181
False negative: b6f11, b6f182
False negative: b6f11, b6f184
False negative: b6f11, b6f186
False negative: b6f11, b6f187
False negative: b6f12, b6f184
False negative: b6f12, b6f187
False negative: b6f13, b6f15
False negative: b6f13, b6f16
False negative: b6f13, b6f180
False negative: b6f13, b6f184
False negative: b6f13, b6f185
False negative: b6f13, b6f187
False negative: b6f14, b6f187
False negative: b6f15, b6f180
False negative: b6f15, b6f186
False negative: b6f15, b6f187
False negative: b6f23, b6f27
False negative: b6f23, b6f28
False negative: b6f23, b6f29
False negative: b6f23, b6f38
False negative: b6f23, b6f90
False negative: b6f27, b6f28
False negative: b6f27, b6f29
False negative: b6f27, b6f38
False negative: b6f27, b6f90
False negative: b6f27, b6f192
False negative: b6f28, b6f38
False negative: b6f28, b6f90
False negative: b6f28, b6f192
False negative: b6f29, b6f38
False negative: b6f29, b6f90
False negative: b6f29, b6f192
False negative: b6f30, b6f38
False negative: b6f47, b6f285
False negative: b6f48, b6f285
False negative: b6f62, b6f63
False negative: b6f62, b6f64
False negative: b6f62, b6f65
False negative: b6f62, b6f67
False negative: b6f62, b6f68
False negative: b6f63, b6f65
False negative: b6f63, b6f66
False negative: b6f63, b6f67
False negative: b6f63, b6f68
False negative: b6f64, b6f66
False negative: b6f64, b6f68
False negative: b6f65, b6f66
False negative: b6f65, b6f68
False negative: b6f66, b6f68
False negative: b6f90, b6f192
False negative: b6f107, b6f296
False negative: b6f179, b6f186
False negative: b6f180, b6f181
False negative: b6f180, b6f184
False negative: b6f180, b6f186
False negative: b6f181, b6f187
False negative: b6f183, b6f186
False negative: b6f184, b6f186
False negative: b6f185, b6f186
False negative: b6f186, b6f187
False negative: b6f190, b6f191
False negative: b6f191, b6f300
False negative: b6f202, b6f272
False negative: b6f217, b6f218
Number of False negative :83
Feature Importance:
cos_bow_fra_rsw_lwc_ste_ra: 12.165794242815803
gram_2_fra_lwc_ste: 2.0515359710681134
jac_bow_fra_rp_lwc_ste_ra: 6.298730432380537
lev_fra_rsw_rp_lwc: 1.808642970150861
tf_idf_fra_rsw_ste: 10.314586224379289
Bias (Intercept): [-8.73701199]
Confusion Matrix:
[[63722 5]
[ 83 93]]
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 63727
1 0.95 0.53 0.68 176
accuracy 1.00 63903
macro avg 0.97 0.76 0.84 63903
weighted avg 1.00 1.00 1.00 63903
Balanced Accuracy: 0.7641653156147601
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
# --- Decision tree classifier on the similarity features ---
features = ['cos_bow_fra_rsw_lwc_ste_ra', 'gram_2_fra_lwc_ste', 'jac_bow_fra_rp_lwc_ste_ra', 'lev_fra_rsw_rp_lwc', 'tf_idf_fra_rsw_ste']
# French display names shown on the plotted tree (presumably 1:1 with
# `features` — verify the 2nd/3rd labels against the metric definitions).
features_name = ['Sac de mots - cosinus', '2-grammes - Jaccard', 'Sac de mots - Jaccard', 'Damereau-Levenshtein', 'Sac de mots (pondéré tf-idf) - cosinus']
target = 'Variations stylistiques'
# BUG FIX: the original did `df_coeff = df`, which aliases the global
# DataFrame — the renamed columns below were silently added to `df` itself,
# mutating the data used by every later cell. Work on a copy instead.
df_coeff = df.copy()
for feat, label in zip(features, features_name):
    df_coeff[label] = df_coeff[feat]
df_coeff["Variations stylistiques"] = df_coeff["S"]
X = df_coeff[features_name]
y = df_coeff[target]
# Shallow tree (depth 3) so the plotted decision rules stay human-readable.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X, y)
# NOTE(review): predictions are made on the training data itself, so every
# metric below measures in-sample fit, not generalization.
y_pred = clf.predict(X)
# Report each misclassified / correctly-flagged pair by fragment code.
counter = 0
for i in range(len(y_pred)):
    if y_pred[i] == 1 and y[i] == 0:
        print(f"False positive: {df.code_1[i]}, {df.code_2[i]}")
        counter = counter + 1
print("Number of False positives :" + str(counter))
counter = 0
for i in range(len(y_pred)):
    if y_pred[i] == 0 and y[i] == 1:
        print(f"False negative: {df.code_1[i]}, {df.code_2[i]}")
        counter = counter + 1
print("Number of False negative :" + str(counter))
counter = 0
for i in range(len(y_pred)):
    if y_pred[i] == 1 and y[i] == 1:
        print(f"True positive: {df.code_1[i]}, {df.code_2[i]}")
        counter = counter + 1
print("Number of True positive :" + str(counter))
# Compute balanced accuracy and overall (in-sample) accuracy.
bal_accuracy = balanced_accuracy_score(y, y_pred)
accuracy = clf.score(X, y)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))
print("Balanced accuracy:")
print(bal_accuracy)
# Render the fitted tree and save a high-resolution copy to disk.
plt.figure(figsize=(30, 10))
plot_tree(clf, filled=True, feature_names=features_name, class_names=["False", "True"], fontsize=17)
plt.savefig("decision_tree.png", format='png', dpi=300)
plt.show()
```
%% Output
False positive: b6f1, b6f20
False positive: b6f7, b6f8
False positive: b6f9, b6f326
False positive: b6f12, b6f153
False positive: b6f18, b6f20
False positive: b6f35, b6f106
False positive: b6f36, b6f95
False positive: b6f36, b6f98
False positive: b6f38, b6f90
False positive: b6f39, b6f174
False positive: b6f39, b6f289
False positive: b6f45, b6f169
False positive: b6f46, b6f159
False positive: b6f46, b6f195
False positive: b6f60, b6f156
False positive: b6f62, b6f90
False positive: b6f62, b6f295
False positive: b6f63, b6f295
False positive: b6f69, b6f70
False positive: b6f75, b6f118
False positive: b6f78, b6f177
False positive: b6f86, b6f91
False positive: b6f86, b6f129
False positive: b6f95, b6f104
False positive: b6f101, b6f306
False positive: b6f115, b6f116
False positive: b6f134, b6f208
False positive: b6f153, b6f181
False positive: b6f158, b6f334
False positive: b6f160, b6f247
False positive: b6f189, b6f268
False positive: b6f192, b6f193
False positive: b6f200, b6f202
False positive: b6f217, b6f219
False positive: b6f217, b6f237
False positive: b6f218, b6f219
False positive: b6f218, b6f237
False positive: b6f220, b6f237
False positive: b6f244, b6f323
False positive: b6f268, b6f347
False positive: b6f286, b6f287
False positive: b6f325, b6f328
Number of False positives :42
False negative: b6f4, b6f23
False negative: b6f4, b6f27
False negative: b6f4, b6f29
False negative: b6f4, b6f38
False negative: b6f4, b6f90
False negative: b6f4, b6f192
False negative: b6f5, b6f23
False negative: b6f5, b6f38
False negative: b6f5, b6f192
False negative: b6f11, b6f13
False negative: b6f11, b6f186
False negative: b6f11, b6f187
False negative: b6f12, b6f186
False negative: b6f23, b6f27
False negative: b6f23, b6f28
False negative: b6f23, b6f38
False negative: b6f23, b6f90
False negative: b6f27, b6f90
False negative: b6f27, b6f192
False negative: b6f28, b6f90
False negative: b6f28, b6f192
False negative: b6f29, b6f90
False negative: b6f47, b6f285
False negative: b6f63, b6f65
False negative: b6f63, b6f66
False negative: b6f90, b6f192
False negative: b6f107, b6f296
False negative: b6f185, b6f186
False negative: b6f186, b6f187
False negative: b6f217, b6f218
Number of False negative :30
Accuracy: 0.9988732923336933
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 63727
1 0.78 0.83 0.80 176
accuracy 1.00 63903
macro avg 0.89 0.91 0.90 63903
weighted avg 1.00 1.00 1.00 63903
Confusion Matrix:
[[63685 42]
[ 30 146]]
Balanced accuracy:
0.9144431966185305
%% Cell type:code id: tags:
``` python
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
# --- Perceptron on the similarity features (in-sample evaluation) ---
parameters = ["cos_bow_fra_rsw_lwc_ste_ra", "gram_2_fra_lwc_ste", "jac_bow_fra_rp_lwc_ste_ra",
              "lev_fra_rsw_rp_lwc", "tf_idf_fra_rsw_ste"]
X = df[parameters]  # features
y = df['S']         # binary target
# Standardize the features to zero mean / unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# BUG FIX: the original computed X_scaled and then fit/predicted on the raw
# X, leaving the standardization as dead code. The perceptron now trains and
# predicts on the scaled features (the printed coefficients are therefore in
# standardized units, and results will differ from the earlier unscaled run).
perceptron = Perceptron()
perceptron.fit(X_scaled, y)
y_pred = perceptron.predict(X_scaled)
# Report each misclassified / correctly-flagged pair by fragment code.
counter = 0
for i in range(len(y_pred)):
    if y_pred[i] == 1 and y[i] == 0:
        print(f"False positive: {df.code_1[i]}, {df.code_2[i]}")
        counter = counter + 1
print("Number of False positives :" + str(counter))
counter = 0
for i in range(len(y_pred)):
    if y_pred[i] == 0 and y[i] == 1:
        print(f"False negative: {df.code_1[i]}, {df.code_2[i]}")
        counter = counter + 1
print("Number of False negative :" + str(counter))
counter = 0
for i in range(len(y_pred)):
    if y_pred[i] == 1 and y[i] == 1:
        print(f"True positive: {df.code_1[i]}, {df.code_2[i]}")
        counter = counter + 1
print("Number of True positive :" + str(counter))
# Metrics — computed on the training data itself (in-sample fit).
bal_accuracy = balanced_accuracy_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))
print("Balanced accuracy:")
print(bal_accuracy)
print("\n")
# Extract and print the learned linear decision rule.
weights = perceptron.coef_[0]
bias = perceptron.intercept_[0]
print("Perceptron Decision Rule:")
for feature, weight in zip(parameters, weights):
    print(f"{feature}: {weight:.2f}")
print(f"Bias: {bias:.2f}")
# Confusion matrix layout:
# [[True Negative  False Positive]
#  [False Negative True Positive ]]
```
%% Output
False positive: b6f7, b6f8
False positive: b6f36, b6f95
False positive: b6f39, b6f289
False positive: b6f45, b6f169
False positive: b6f46, b6f159
False positive: b6f78, b6f177
False positive: b6f95, b6f104
False positive: b6f160, b6f247
False positive: b6f200, b6f202
False positive: b6f217, b6f219
False positive: b6f286, b6f287
Number of False positives :11
False negative: b6f4, b6f5
False negative: b6f4, b6f23
False negative: b6f4, b6f27
False negative: b6f4, b6f28
False negative: b6f4, b6f29
False negative: b6f4, b6f38
False negative: b6f4, b6f90
False negative: b6f4, b6f192
False negative: b6f5, b6f23
False negative: b6f5, b6f27
False negative: b6f5, b6f29
False negative: b6f5, b6f38
False negative: b6f5, b6f90
False negative: b6f5, b6f192
False negative: b6f11, b6f12
False negative: b6f11, b6f13
False negative: b6f11, b6f16
False negative: b6f11, b6f179
False negative: b6f11, b6f181
False negative: b6f11, b6f184
False negative: b6f11, b6f186
False negative: b6f11, b6f187
False negative: b6f12, b6f184
False negative: b6f12, b6f187
False negative: b6f13, b6f16
False negative: b6f13, b6f180
False negative: b6f13, b6f187
False negative: b6f14, b6f187
False negative: b6f15, b6f180
False negative: b6f15, b6f186
False negative: b6f15, b6f187
False negative: b6f23, b6f27
False negative: b6f23, b6f28
False negative: b6f23, b6f29
False negative: b6f23, b6f38
False negative: b6f23, b6f90
False negative: b6f27, b6f28
False negative: b6f27, b6f29
False negative: b6f27, b6f38
False negative: b6f27, b6f90
False negative: b6f27, b6f192
False negative: b6f28, b6f38
False negative: b6f28, b6f90
False negative: b6f28, b6f192
False negative: b6f29, b6f38
False negative: b6f29, b6f90
False negative: b6f29, b6f192
False negative: b6f30, b6f38
False negative: b6f47, b6f285
False negative: b6f48, b6f285
False negative: b6f62, b6f63
False negative: b6f62, b6f64
False negative: b6f62, b6f65
False negative: b6f62, b6f67
False negative: b6f62, b6f68
False negative: b6f63, b6f65
False negative: b6f63, b6f66
False negative: b6f63, b6f67
False negative: b6f63, b6f68
False negative: b6f64, b6f66
False negative: b6f64, b6f68
False negative: b6f65, b6f66
False negative: b6f65, b6f68
False negative: b6f66, b6f68
False negative: b6f90, b6f192
False negative: b6f107, b6f296
False negative: b6f180, b6f186
False negative: b6f181, b6f187
False negative: b6f183, b6f186
False negative: b6f184, b6f186
False negative: b6f185, b6f186
False negative: b6f186, b6f187
False negative: b6f190, b6f191
False negative: b6f202, b6f272
False negative: b6f217, b6f218
Number of False negative :75
Accuracy: 0.9986542102874669
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 63727
1 0.90 0.57 0.70 176
accuracy 1.00 63903
macro avg 0.95 0.79 0.85 63903
weighted avg 1.00 1.00 1.00 63903
Confusion Matrix:
[[63716 11]
[ 75 101]]
Balanced accuracy:
0.7868455125342904
Perceptron Decision Rule:
cos_bow_fra_rsw_lwc_ste_ra: 2.85
gram_2_fra_lwc_ste: 1.32
jac_bow_fra_rp_lwc_ste_ra: 4.22
lev_fra_rsw_rp_lwc: -2.12
tf_idf_fra_rsw_ste: 4.76
Bias: -3.00
......
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment