Shorten lec 35: remove the Text Classification section (SMS spam, Rotten Tomatoes reviews, sentence embeddings)

papajohn · papajohn · commit 94c2e810d6b4 · 2026-04-24T10:16:08.000-07:00
diff --git a/lec/lec35/lec35.ipynb b/lec/lec35/lec35.ipynb
@@ -248,249 +248,6 @@
     "\n",
     "make_array(left, right)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Text Classification"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "sms = load_dataset('ucirvine/sms_spam', split='train').shuffle(seed=42)\n",
-    "sms_texts = np.array(sms['sms'])\n",
-    "sms_labels = np.array(sms['label'])\n",
-    "\n",
-    "sms_tbl = Table().with_columns('Text', sms_texts, 'Class', sms_labels)\n",
-    "sms_tbl.group('Class').show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sms_tbl.where('Class', 1).sample(with_replacement=False).show(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sms_tbl.where('Class', 0).sample(with_replacement=False).show(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "texts = sms_tbl.column('Text')\n",
-    "\n",
-    "sms_data = Table().with_columns(\n",
-    "    'Chars', np.char.str_len(texts),\n",
-    "    'Digits', sum(np.char.count(texts, str(d)) for d in range(10)),\n",
-    "    'Caps', sum(np.char.count(texts, chr(c)) for c in range(65, 91)),\n",
-    "    'Exclamations', np.char.count(texts, '!'),\n",
-    "    'Class', sms_tbl.column('Class')\n",
-    ")\n",
-    "sms_data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "sms_data.scatter('Digits', 'Caps', group='Class')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "shuffled = sms_data.sample(with_replacement=False)\n",
-    "test_size = 100\n",
-    "train_sms = shuffled.take(np.arange(test_size, shuffled.num_rows))\n",
-    "test_sms = shuffled.take(np.arange(test_size))\n",
-    "\n",
-    "print('Training:', train_sms.num_rows, ' Test:', test_sms.num_rows)\n",
-    "evaluate_accuracy(train_sms, test_sms, 5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Rotten Tomatoes Movie Reviews"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "reviews_full = load_dataset('rotten_tomatoes', split='train')\n",
-    "reviews_short = reviews_full.filter(lambda x: 5 <= len(x['text'].split()) <= 10)\n",
-    "\n",
-    "reviews = Table().with_columns('Text', reviews_short['text'],\n",
-    "                               'Class', reviews_short['label'])\n",
-    "reviews = reviews.sample(with_replacement=False)  # Permute the rows\n",
-    "reviews.group('Class')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "reviews.sample(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "words = [  # The most common adjectives in the data\n",
-    "    'good', 'bad', 'funny', 'little', 'much', 'new', 'best',\n",
-    "    'many', 'own', 'other', 'big', 'great', 'most', 'few',\n",
-    "    'real', 'first', 'full', 'american', 'romantic', 'same', 'old',\n",
-    "    'better', 'young', 'original', 'interesting', 'human',\n",
-    "    'hard', 'cinematic', 'enough', 'emotional', 'last', 'least', 'long',\n",
-    "    'true', 'predictable', 'visual', 'whole', 'high', 'special',\n",
-    "    'entertaining', 'sweet', 'enjoyable', 'narrative', 'familiar'\n",
-    "]\n",
-    "counts = Table(['Word', 'Positive', 'Negative'])\n",
-    "for word in words:\n",
-    "    has_word = reviews.where('Text', are.containing(word))\n",
-    "    counts = counts.with_row([word, has_word.where('Class', 1).num_rows,\n",
-    "                                    has_word.where('Class', 0).num_rows])\n",
-    "\n",
-    "counts"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "reviews.where('Text', are.containing('funny')).where('Class', 0).sample(5, with_replacement=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "texts = reviews.column('Text')\n",
-    "review_words = Table().with_column('Class', reviews.column('Class'))\n",
-    "for word in words:\n",
-    "    review_words = review_words.with_column(word, np.char.count(np.char.lower(texts), word))\n",
-    "\n",
-    "review_words.sample(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_reviews = review_words.take(np.arange(test_size, reviews.num_rows))\n",
-    "test_reviews = review_words.take(np.arange(test_size))\n",
-    "\n",
-    "print('Word-count KNN:')\n",
-    "evaluate_accuracy(train_reviews, test_reviews, 5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "classify_all(train_reviews, test_reviews, 5).pivot('Prediction', 'Class')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Sentence Embeddings"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sentence_transformers import SentenceTransformer\n",
-    "\n",
-    "embedder = SentenceTransformer('all-MiniLM-L6-v2')\n",
-    "review_emb = embedder.encode(list(reviews.column('Text')), show_progress_bar=True)\n",
-    "print('Embedding shape:', review_emb.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "n_features = 64  # Increasing this will help, but above 128 datahub will crash\n",
-    "\n",
-    "cols = ['Class', reviews.column('Class')]\n",
-    "for i in range(n_features):\n",
-    "    cols += [f'Embed{i}', review_emb[:, i]]\n",
-    "\n",
-    "review_emb_table = Table().with_columns(*cols)\n",
-    "review_emb_table.row(0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train = review_emb_table.take(np.arange(test_size, reviews.num_rows))\n",
-    "test = review_emb_table.take(np.arange(test_size))\n",
-    "evaluate_accuracy(train, test, 5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "classify_all(train, test, 5).pivot('Prediction', 'Class')"
-   ]
   }
  ],
  "metadata": {