05.08.2021
Performance von 13 Modellen auf 165 Klassifikationsproblemen
Bild übernommen von: Olson et al. (2018), Data-driven Advice for Applying Machine Learning to Bioinformatics Problems, https://arxiv.org/pdf/1708.05070.pdf
siehe: https://deepmind.com/blog/article/alphago-zero-starting-scratch
Bild: Donarreiskoffer, Wikimedia, 2004, https://commons.wikimedia.org/wiki/File:Go_board.jpg
siehe: https://deepmind.com/blog/article/AlphaFold-Using-AI-for-scientific-discovery
Bild: B1357M, Wikimedia, 2014, https://commons.wikimedia.org/wiki/File:FADD_based_on_pdb_file_2GF5.gif
Take-Home-Message: Vergleiche sind meist fallabhängig.
Auszug aus: Nguyen et al., 2019, Machine Learning and Deep Learning frameworks and libraries for large-scale data mining: a survey
Anmerkung: Code-Beispiele in R (siehe: https://www.r-project.org/)
Code und Folien auf https://martinzaefferer.de/
## Read the auto-mpg data set; "?" encodes missing values
data <- read.table("auto-mpg.data", na.strings = "?")
## Assign descriptive column names (the raw file has none)
names(data) <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "model_year", "origin", "car_name"
)
## Quick structural overview of the resulting data frame
str(data)
## 'data.frame': 398 obs. of 9 variables: ## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ... ## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ... ## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ... ## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ... ## $ weight : num 3504 3693 3436 3433 3449 ... ## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ... ## $ model_year : int 70 70 70 70 70 70 70 70 70 70 ... ## $ origin : int 1 1 1 1 1 1 1 1 1 1 ... ## $ car_name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## Drop the non-numeric car name column
data$car_name <- NULL
## Keep only rows with a known horsepower value
data <- data[!is.na(data$horsepower), ]
## Seed the random number generator for a reproducible split
set.seed(0)
## 70/30 train/validation split; predictors as 'matrix',
## first column (mpg) is the regression label
indices <- sample(seq_len(nrow(data)), 0.7 * nrow(data))
train_data <- as.matrix(data[indices, -1])
train_label <- data[indices, 1]
val_data <- as.matrix(data[-indices, -1])
val_label <- data[-indices, 1]
## Install the xgboost package from CRAN (only needed once per machine)
install.packages("xgboost")
## Load xgboost and seed the RNG for reproducible training
library(xgboost)
set.seed(1234)
## Fit a gradient-boosted tree model:
## 10 boosting rounds, single thread, no console output
xgb_model <- xgboost(
  data = train_data, label = train_label,
  nrounds = 10, nthread = 1, verbose = 0
)
## Visualize the first two trees of the boosted ensemble
## (tree indices in xgboost are 0-based)
p1 <- xgb.plot.tree(model = xgb_model, trees = 0)
p2 <- xgb.plot.tree(model = xgb_model, trees = 1)
## Min-max scaling of a vector to the interval [0, 1]
## NOTE(review): a constant input (max == min) would divide by zero — the
## auto-mpg predictors are non-constant, so this does not occur here
scalef <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
## Load keras / tensorflow
library(keras)
## Seed tensorflow for reproducibility
tensorflow::set_random_seed(1234)
## Initialize an empty sequential model
model <- keras_model_sequential()
## Network architecture: one hidden layer with 20 units, one output unit
model %>%
  layer_dense(units = 20, input_shape = ncol(train_data)) %>%
  layer_dense(units = 1)
## Compile with mean squared error loss and the Adam optimizer
model %>% compile(loss = 'mse', optimizer = "adam")
## Train for 100 epochs on column-wise min-max-scaled predictors
histo <- model %>% fit(
  x = apply(train_data, 2, scalef),
  y = train_label,
  verbose = 0,
  epochs = 100
)
## Interactive (plotly) version of the keras training-history plot
plotly::ggplotly(plot(histo))
## Gradient boosting: root mean squared error on the validation set
pred_gb <- predict(xgb_model, val_data)
sqrt(mean((pred_gb - val_label)^2))
## [1] 2.851855
## Neural network: evaluate on the scaled validation data and report RMSE
## (evaluate returns the MSE loss, hence the square root)
scores <- evaluate(model, apply(val_data, 2, scalef), val_label, verbose = 0)
print(sqrt(scores))
## loss ## 4.856659
siehe: experimentReplication.R
## Load replicated experiment results and compare
## the RMSE distributions of both models as box plots
load("rmses.RData")
library(plotly)
fig <- plot_ly(x = rmse_gb, type = "box", name = "GB")
fig <- add_trace(fig, x = rmse_nn, name = "NN")
layout(
  fig,
  xaxis = list(title = "rmse"),
  yaxis = list(autorange = "reversed")
)
siehe: tuneNN.R, tuneXgboost.R
## Load xgboost tuning results and the plotting helpers,
## then draw a parallel-coordinates plot of the optimized configurations
load("resultFilexgboost.RData")
source("plotting.R")
plot_parallel(whichopt(resultpp2),yrange = c(1,5))
## Same parallel-coordinates visualization for the tuned neural network
## (plot_parallel/whichopt come from plotting.R, sourced on the previous slide)
load("resultFileNN.RData")
plot_parallel(whichopt(resultpp2), yrange = c(1,5))
## Final comparison: untuned vs. tuned (starred) variants of both models
fig <- plot_ly(x = rmse_gb, type = "box", name = "GB")
fig <- add_trace(fig, x = rmse_nn, name = "NN")
fig <- add_trace(fig, x = rmse_gb_tuned, name = "GB*")
fig <- add_trace(fig, x = rmse_nn_tuned, name = "NN*")
layout(
  fig,
  xaxis = list(title = "rmse"),
  yaxis = list(autorange = "reversed")
)