{ "cells": [ { "cell_type": "code", "execution_count": 97, "id": "e2813538", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from scipy.linalg import svd\n", "\n", "from sklearn.datasets import load_svmlight_file\n", "from sklearn.metrics.pairwise import rbf_kernel\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "rng = np.random.default_rng(42)" ] }, { "cell_type": "code", "execution_count": 98, "id": "641aeaac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(49990, 22) (49990,)\n" ] } ], "source": [ "X, y = load_svmlight_file(\"./data/ijcnn1.bz2\")\n", "print(X.shape, y.shape)" ] }, { "cell_type": "code", "execution_count": 99, "id": "0e003e1e", "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler(with_mean=False)\n", "X_scaled = scaler.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 100, "id": "a3a85d20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((20000, 22), (29990, 22))" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X_scaled,\n", " y,\n", " test_size=29990 / 49990,\n", " random_state=42,\n", ")\n", "\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 101, "id": "b78a08ae", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "from sklearn.svm import LinearSVC\n", "\n", "clf = LinearSVC(dual=False, random_state=42)\n", "\n", "start_time = time.time()\n", "clf.fit(X_train, y_train)\n", "training_time_svc = time.time() - start_time\n", "y_pred_svc = clf.predict(X_test)\n", "accuracy_svc = accuracy_score(y_test, y_pred_svc)" ] }, { "cell_type": "code", "execution_count": 102, "id": "8cb70eb0", "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "\n", "clf_rbf = SVC(kernel=\"rbf\", random_state=42)\n", "\n", "start_time = time.time()\n", "clf_rbf.fit(X_train, y_train)\n", "training_time_rbf = time.time() - start_time\n", "y_pred_rbf = clf_rbf.predict(X_test)\n", "accuracy_rbf = accuracy_score(y_test, y_pred_rbf)" ] }, { "cell_type": "code", "execution_count": 103, "id": "cf96d113", "metadata": {}, "outputs": [], "source": [ "def compute_rkf(\n", " X_train: np.ndarray,\n", " X_test: np.ndarray,\n", " c: int = 300,\n", " gamma: float | None = None,\n", ") -> tuple[np.ndarray, np.ndarray]:\n", " \"\"\"Compute Random Kitchen Features for RBF kernel approximation.\n", "\n", " Args:\n", " X_train (np.ndarray): Training data of shape (n1, p).\n", " X_test (np.ndarray): Test data of shape (n2, p).\n", " c (int): Number of random features to generate.\n", " gamma (float | None): Kernel coefficient for RBF kernel. If None, defaults to 1/p.\n", "\n", " Returns:\n", " tuple[np.ndarray, np.ndarray]: Transformed training and test data of shapes (n1, c) and (n2, c).\n", "\n", " \"\"\"\n", " p = X_train.shape[1]\n", "\n", " if gamma is None:\n", " gamma = 1 / p\n", "\n", " W = rng.normal(0, np.sqrt(2 * gamma), size=(p, c))\n", " b = rng.uniform(0, 2 * np.pi, size=(1, c))\n", "\n", " def transform(X: np.ndarray) -> np.ndarray:\n", " \"\"\"Transform the input data using Random Kitchen Features.\"\"\"\n", " return np.sqrt(2 / c) * np.cos(np.dot(X, W) + b)\n", "\n", " return transform(X_train), transform(X_test)\n" ] }, { "cell_type": "code", "execution_count": 104, "id": "d0487ec1", "metadata": {}, "outputs": [], "source": [ "Z_train, Z_test = compute_rkf(X_train.toarray(), X_test.toarray(), c=300)\n", "\n", "clf_rkf = LinearSVC(dual=False, random_state=42)\n", "\n", "start_time = time.time()\n", "clf_rkf.fit(Z_train, y_train)\n", "training_time_rkf = time.time() - start_time\n", "\n", "y_pred_rkf = clf_rkf.predict(Z_test)\n", "accuracy_rkf = accuracy_score(y_test, y_pred_rkf)" ] }, { "cell_type": "code", "execution_count": 105, "id": "bf07c16e", "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "Méthode", "rawType": "object", "type": "string" }, { "name": "Accuracy", "rawType": "float64", "type": "float" }, { "name": "Temps (s)", "rawType": "float64", "type": "float" } ], "ref": "5e2536cf-d951-4c4a-9a11-c620b11f6cce", "rows": [ [ "0", "Linear SVM (Raw)", "0.9221407135711904", "0.08201122283935547" ], [ "1", "Full RBF SVM", "0.9737912637545849", "1.1328659057617188" ], [ "2", "RKF (c=300)", "0.9551517172390797", "0.713508129119873" ] ], "shape": { "columns": 3, "rows": 3 } }, "text/html": [ "
| \n", " | Méthode | \n", "Accuracy | \n", "Temps (s) | \n", "
|---|---|---|---|
| 0 | \n", "Linear SVM (Raw) | \n", "0.922141 | \n", "0.082011 | \n", "
| 1 | \n", "Full RBF SVM | \n", "0.973791 | \n", "1.132866 | \n", "
| 2 | \n", "RKF (c=300) | \n", "0.955152 | \n", "0.713508 | \n", "