diff --git a/03_classification.ipynb b/03_classification.ipynb
index de19856..25c8b98 100644
--- a/03_classification.ipynb
+++ b/03_classification.ipynb
@@ -1038,6 +1038,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "np.random.seed(42)\n",
     "noise = np.random.randint(0, 100, (len(X_train), 784))\n",
     "X_train_mod = X_train + noise\n",
     "noise = np.random.randint(0, 100, (len(X_test), 784))\n",
@@ -1403,7 +1404,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pathlib import Path\n",
     "import pandas as pd\n",
     "import urllib.request\n",
     "\n",
@@ -1624,7 +1624,6 @@
    "source": [
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.preprocessing import StandardScaler\n",
     "\n",
     "num_pipeline = Pipeline([\n",
     "    (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
@@ -1731,8 +1730,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sklearn.ensemble import RandomForestClassifier\n",
-    "\n",
     "forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
     "forest_clf.fit(X_train, y_train)"
    ]
   },
@@ -1767,8 +1764,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sklearn.model_selection import cross_val_score\n",
-    "\n",
     "forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
     "forest_scores.mean()"
    ]
   },
@@ -1820,8 +1815,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import matplotlib.pyplot as plt\n",
-    "\n",
     "plt.figure(figsize=(8, 4))\n",
     "plt.plot([1]*10, svm_scores, \".\")\n",
     "plt.plot([2]*10, forest_scores, \".\")\n",
@@ -1867,7 +1860,8 @@
    "outputs": [],
    "source": [
     "train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"] + train_data[\"Parch\"]\n",
-    "train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(['RelativesOnboard']).mean()"
+    "train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(\n",
+    "    ['RelativesOnboard']).mean()"
    ]
   },
   {
@@ -1899,9 +1893,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pathlib import Path\n",
     "import tarfile\n",
-    "import urllib.request\n",
     "\n",
     "def fetch_spam_data():\n",
     "    root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n",
@@ -2141,7 +2133,8 @@
     "X = np.array(ham_emails + spam_emails, dtype=object)\n",
     "y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
     "\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
+    "                                                     random_state=42)"
    ]
   },
   {
@@ -2247,9 +2240,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's throw in some stemming! For this to work, you need to install the Natural Language Toolkit ([NLTK](http://www.nltk.org/)). It's as simple as running the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
-    "\n",
-    "`$ pip3 install nltk`"
+    "Let's throw in some stemming! We will use the Natural Language Toolkit ([NLTK](http://www.nltk.org/)):"
    ]
   },
   {
@@ -2258,24 +2249,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "try:\n",
-    "    import nltk\n",
+    "import nltk\n",
     "\n",
-    "    stemmer = nltk.PorterStemmer()\n",
-    "    for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n",
-    "        print(word, \"=>\", stemmer.stem(word))\n",
-    "except ImportError:\n",
-    "    print(\"Error: stemming requires the NLTK module.\")\n",
-    "    stemmer = None"
+    "stemmer = nltk.PorterStemmer()\n",
+    "for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\",\n",
+    "             \"Compulsive\"):\n",
+    "    print(word, \"=>\", stemmer.stem(word))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library. You can install it with the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
-    "\n",
-    "`$ pip3 install urlextract`"
+    "We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library:"
    ]
   },
   {
@@ -2306,14 +2292,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "try:\n",
-    "    import urlextract # may require an Internet connection to download root domain names\n",
-    "    \n",
-    "    url_extractor = urlextract.URLExtract()\n",
-    "    print(url_extractor.find_urls(\"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"))\n",
-    "except ImportError:\n",
-    "    print(\"Error: replacing URLs requires the urlextract module.\")\n",
-    "    url_extractor = None"
+    "import urlextract # may require an Internet connection to download root domain\n",
+    "                  # names\n",
+    "\n",
+    "url_extractor = urlextract.URLExtract()\n",
+    "some_text = \"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"\n",
+    "print(url_extractor.find_urls(some_text))"
    ]
   },
   {
@@ -2332,8 +2316,9 @@
     "from sklearn.base import BaseEstimator, TransformerMixin\n",
     "\n",
     "class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
-    "    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n",
-    "                 replace_urls=True, replace_numbers=True, stemming=True):\n",
+    "    def __init__(self, strip_headers=True, lower_case=True,\n",
+    "                 remove_punctuation=True, replace_urls=True,\n",
+    "                 replace_numbers=True, stemming=True):\n",
     "        self.strip_headers = strip_headers\n",
     "        self.lower_case = lower_case\n",
     "        self.remove_punctuation = remove_punctuation\n",
@@ -2417,7 +2402,8 @@
     "            for word, count in word_count.items():\n",
     "                total_count[word] += min(count, 10)\n",
     "        most_common = total_count.most_common()[:self.vocabulary_size]\n",
-    "        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n",
+    "        self.vocabulary_ = {word: index + 1\n",
+    "                            for index, (word, count) in enumerate(most_common)}\n",
     "        return self\n",
     "    def transform(self, X, y=None):\n",
     "        rows = []\n",
@@ -2428,7 +2414,8 @@
     "                rows.append(row)\n",
     "                cols.append(self.vocabulary_.get(word, 0))\n",
     "                data.append(count)\n",
-    "        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))"
+    "        return csr_matrix((data, (rows, cols)),\n",
+    "                          shape=(len(X), self.vocabulary_size + 1))"
    ]
   },
   {
@@ -2490,13 +2477,6 @@
     "X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Note**: to be future-proof, we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 145,
@@ -2507,7 +2487,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.model_selection import cross_val_score\n",
     "\n",
     "log_clf = LogisticRegression(max_iter=1000, random_state=42)\n",
-    "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
+    "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)\n",
     "score.mean()"
    ]
   },