diff --git a/News_category_project/.ipynb_checkpoints/News_cat-checkpoint.ipynb b/News_category_project/.ipynb_checkpoints/News_cat-checkpoint.ipynb new file mode 100644 index 0000000..fc54971 --- /dev/null +++ b/News_category_project/.ipynb_checkpoints/News_cat-checkpoint.ipynb @@ -0,0 +1,2856 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# News Category Finder using NLP\n", + "\n", + "\n", + "Please check the test, train and sample submission files\n", + "#### Importing Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_excel('Data_Train.xlsx', sheet_name='Sheet1')\n", + "test = pd.read_excel('Data_Test.xlsx', sheet_name='Sheet1')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STORYSECTION
0But the most painful was the huge reversal in ...3
1How formidable is the opposition alliance amon...0
2Most Asian currencies were trading lower today...3
3If you want to answer any question, click on ‘...1
4In global markets, gold prices edged up today ...3
\n", + "
" + ], + "text/plain": [ + " STORY SECTION\n", + "0 But the most painful was the huge reversal in ... 3\n", + "1 How formidable is the opposition alliance amon... 0\n", + "2 Most Asian currencies were trading lower today... 3\n", + "3 If you want to answer any question, click on ‘... 1\n", + "4 In global markets, gold prices edged up today ... 3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STORY
02019 will see gadgets like gaming smartphones ...
1It has also unleashed a wave of changes in the...
2It can be confusing to pick the right smartpho...
3The mobile application is integrated with a da...
4We have rounded up some of the gadgets that sh...
\n", + "
" + ], + "text/plain": [ + " STORY\n", + "0 2019 will see gadgets like gaming smartphones ...\n", + "1 It has also unleashed a wave of changes in the...\n", + "2 It can be confusing to pick the right smartpho...\n", + "3 The mobile application is integrated with a da...\n", + "4 We have rounded up some of the gadgets that sh..." + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SECTION
count7628.000000
mean1.357892
std0.999341
min0.000000
25%1.000000
50%1.000000
75%2.000000
max3.000000
\n", + "
" + ], + "text/plain": [ + " SECTION\n", + "count 7628.000000\n", + "mean 1.357892\n", + "std 0.999341\n", + "min 0.000000\n", + "25% 1.000000\n", + "50% 1.000000\n", + "75% 2.000000\n", + "max 3.000000" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STORY
countuniquetopfreq
SECTION
016861673This story has been published from a wire agen...4
127722731This story has been published from a wire agen...13
219241914The consensus reads, “Exciting, entertaining, ...3
312461233This story has been published from a wire agen...11
\n", + "
" + ], + "text/plain": [ + " STORY \n", + " count unique top freq\n", + "SECTION \n", + "0 1686 1673 This story has been published from a wire agen... 4\n", + "1 2772 2731 This story has been published from a wire agen... 13\n", + "2 1924 1914 The consensus reads, “Exciting, entertaining, ... 3\n", + "3 1246 1233 This story has been published from a wire agen... 11" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.groupby('SECTION').describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.tokenize import word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running\n", + "run\n" + ] + } + ], + "source": [ + "from nltk.stem.wordnet import WordNetLemmatizer \n", + "lem = WordNetLemmatizer()\n", + "\n", + "from nltk.stem.porter import PorterStemmer \n", + "stem = PorterStemmer()\n", + "\n", + "word = \"running\" \n", + "print(lem.lemmatize(word))\n", + "\n", + "print(stem.stem(word))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.stem.wordnet import WordNetLemmatizer \n", + "from nltk.stem.porter import PorterStemmer \n", + "lem = WordNetLemmatizer()\n", + "stem = PorterStemmer()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Processing Tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "\n", + "def text_process(mess):\n", + " \"\"\"\n", + " Takes in a string of text, then performs the following:\n", + " 1. Remove all punctuation\n", + " 2. Remove all stopwords\n", + " 3. 
Returns a list of the cleaned text\n", + " \"\"\"\n", + " # Check characters to see if they are in punctuation\n", + " nopunc = [char for char in mess if char not in string.punctuation]\n", + "\n", + " # Join the characters again to form the string.\n", + " nopunc = ''.join(nopunc)\n", + " \n", + " token = word_tokenize(nopunc)\n", + " \n", + " # Now just remove any stopwords\n", + " no_noise = [word for word in token if word.lower() not in (stopwords.words('english') and ['\"',\"'\",'`','”','“'])]\n", + " # Stemming\n", + " \n", + " \n", + " return no_noise\n", + "#[stem.stem(word.lower()) for word in no_noise]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [painful, huge, reversal, fee, income, unheard...\n", + "1 [formidable, opposition, alliance, among, Cong...\n", + "2 [Asian, currencies, trading, lower, today, Sou...\n", + "3 [want, answer, question, click, ‘, Answer, ’, ...\n", + "4 [global, markets, gold, prices, edged, today, ...\n", + "5 [BEIJING, Chinese, tech, giant, Huawei, announ...\n", + "6 [Mumbai, India, Incs, external, commercial, bo...\n", + "7 [Wednesday, Federal, Reserve, Chairman, Jerome...\n", + "8 [give, audience, already, done, Yeh, Hai, Aash...\n", + "9 [com, Arbaaz, Khan, spoke, getting, back, Daba...\n", + "10 [“, One, would, think, development, testing, p...\n", + "11 [far, year, rupee, gained, 07, foreign, invest...\n", + "12 [Xiaomi, however, sees, presence, Jio, rural, ...\n", + "13 [ad, reads, bells, whistles, Bezel, notch, app...\n", + "14 [Tuesday, Powell, said, healthy, US, economy, ...\n", + "15 [feature, help, make, display, responsive, int...\n", + "16 [TikTok, popular, among, children, facing, cri...\n", + "17 [company, hive, ratings, business, whollyowned...\n", + "18 [chooses, hide, CP, colleagues, move, mother, ...\n", + "19 [’, right, opera, house, simply, goes, show, A...\n", + "20 [Facebook, said, eligible, creators, would, ab...\n", + "21 [Starring, Varun, Dhawan, Alia, Bhatt, Sonaksh...\n", + "22 [GKN, Securities, barred, misuse, socalled, da...\n", + "23 [Fintech, startup, Zeta, cofounded, Bhavin, Tu...\n", + "24 [story, published, wire, agency, feed, without...\n", + "25 [Globally, established, companies, Stratasys, ...\n", + "26 [statements, Yeddyurappa, says, air, strikes, ...\n", + "27 [NDA, seeks, reelection, agriculture, form, im...\n", + "28 [Yeddyurappa, said, IAF, air, strikes, would, ...\n", + "29 [”, two, releases, year, far, Milan, Talkies, ...\n", + " ... 
\n", + "7598 [“, TDP, party, ideology, gotten, sidelined, e...\n", + "7599 [far, whenever, ’, reviewed, Kindles, Paperwhi...\n", + "7600 [day, markets, saw, high, volatility, followin...\n", + "7601 [Today, Gmail, allows, 15GB, free, storage, Us...\n", + "7602 [Aparajita, Sarangi, took, voluntary, retireme...\n", + "7603 [Investors, awaiting, economic, growth, data, ...\n", + "7604 [advice, online, survivalists, moves, unpopula...\n", + "7605 [time, developers, disguising, app, pretend, c...\n", + "7606 [Lok, Sabha, elections, 2019, Fifth, phase, vo...\n", + "7607 [watchdog, passed, five, separate, orders, tog...\n", + "7608 [Twitter, post, last, week, OnePlus, confirmed...\n", + "7609 [iOSonly, email, client, named, Spark, launche...\n", + "7610 [said, question, really, spend, ₹10000, portab...\n", + "7611 [’, want, kind, movie, felt, done, similar, ki...\n", + "7612 [1999, film, Mother, received, Best, Foreign, ...\n", + "7613 [Mohan, Babu, considered, political, heavyweig...\n", + "7614 [SP, 500, opened, higher, 126, points, 004, 28...\n", + "7615 [However, reports, suggest, would, another, sm...\n", + "7616 [Mumbai, Indian, stocks, rose, key, indices, e...\n", + "7617 [Sure, others, slightly, faster, slightly, sha...\n", + "7618 [147, million, pixels, one, billion, colours, ...\n", + "7619 [BJD, supporter, Puri, Congress, leaders, well...\n", + "7620 [Bollywood, celebrities, took, social, media, ...\n", + "7621 [However, confirmation, developers, games, wou...\n", + "7622 [terms, optics, back, Redmi, Note, 7, boasts, ...\n", + "7623 [Karnataka, Congress, bastion, also, gave, BJP...\n", + "7624 [film, also, features, Janhvi, Kapoor, revolve...\n", + "7625 [database, created, bringing, together, crimin...\n", + "7626 [state, uneasy, relationship, mainland, since,...\n", + "7627 [Virus, stars, Kunchacko, Boban, Tovino, Thoma...\n", + "Name: STORY, Length: 7628, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train['STORY'].apply(text_process)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bag of words" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "bow_transformer = CountVectorizer(analyzer=text_process).fit(train['STORY'])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "44346\n" + ] + } + ], + "source": [ + "print(len(bow_transformer.vocabulary_))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (0, 98)\t1\n", + " (0, 458)\t1\n", + " (0, 4354)\t1\n", + " (0, 6127)\t1\n", + " (0, 7903)\t1\n", + " (0, 17114)\t1\n", + " (0, 19913)\t1\n", + " (0, 20711)\t1\n", + " (0, 21236)\t1\n", + " (0, 22284)\t1\n", + " (0, 23796)\t1\n", + " (0, 24786)\t1\n", + " (0, 25194)\t1\n", + " (0, 25607)\t1\n", + " (0, 26393)\t1\n", + " (0, 26400)\t1\n", + " (0, 26889)\t1\n", + " (0, 27416)\t1\n", + " (0, 28063)\t1\n", + " (0, 28740)\t2\n", + " (0, 28793)\t2\n", + " (0, 29335)\t1\n", + " (0, 32289)\t3\n", + " (0, 33750)\t1\n", + " (0, 34213)\t1\n", + " (0, 34771)\t1\n", + " (0, 35747)\t1\n", + " (0, 37546)\t1\n", + " (0, 37647)\t1\n", + " (0, 
39826)\t1\n", + " (0, 41132)\t2\n", + " (0, 42868)\t1\n", + "(1, 44346)\n" + ] + } + ], + "source": [ + "bow4 = bow_transformer.transform([train['STORY'][4]])\n", + "print(bow4) # vectors pointing from origin\n", + "print(bow4.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "markets\n", + "stock\n" + ] + } + ], + "source": [ + "print(bow_transformer.get_feature_names()[32289])\n", + "print(bow_transformer.get_feature_names()[39826])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "messages_bow = bow_transformer.transform(train['STORY'])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of Sparse Matrix: (7628, 44346)\n", + "Amount of Non-Zero occurences: 417825\n" + ] + } + ], + "source": [ + "print('Shape of Sparse Matrix: ', messages_bow.shape)\n", + "print('Amount of Non-Zero occurences: ', messages_bow.nnz)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sparsity: 0.12351772521704532\n" + ] + } + ], + "source": [ + "sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))\n", + "\n", + "print(f'sparsity: {sparsity}')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (0, 42868)\t0.1653840117317503\n", + " (0, 41132)\t0.23264193296874802\n", + " (0, 39826)\t0.12464706628982292\n", + " (0, 37647)\t0.13059926000708844\n", + " (0, 37546)\t0.13533155309101896\n", + " (0, 35747)\t0.11016931263373622\n", + " (0, 34771)\t0.11197548409940615\n", + " (0, 34213)\t0.17507399610069765\n", + " (0, 33750)\t0.2329734713609702\n", + " (0, 32289)\t0.3411710195006078\n", + " (0, 29335)\t0.14164721721626908\n", + " (0, 28793)\t0.2878801405286177\n", + " (0, 28740)\t0.21695807780745277\n", + " (0, 28063)\t0.1342731302228393\n", + " (0, 27416)\t0.1771020108713445\n", + " (0, 26889)\t0.13365789947306533\n", + " (0, 26400)\t0.20124735910228528\n", + " (0, 26393)\t0.1288535100738698\n", + " (0, 25607)\t0.18575848290189373\n", + " (0, 25194)\t0.20712745650428582\n", + " (0, 24786)\t0.09579350098100424\n", + " (0, 23796)\t0.130962730415073\n", + " (0, 22284)\t0.14220199953463913\n", + " (0, 21236)\t0.09531611135230537\n", + " (0, 20711)\t0.18575848290189373\n", + " (0, 19913)\t0.14959653531880082\n", + " (0, 17114)\t0.17929921522347367\n", + " (0, 7903)\t0.15233170401746512\n", + " (0, 6127)\t0.11539264864412509\n", + " (0, 4354)\t0.14394007026430886\n", + " (0, 458)\t0.2436579581621663\n", + " (0, 98)\t0.16899942411275612\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "\n", + "tfidf_transformer = TfidfTransformer().fit(messages_bow)\n", + "\n", + "# TEST\n", + "tfidf4 = tfidf_transformer.transform(bow4)\n", + "print(tfidf4)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.74248747675401\n", + "8.33027414165613\n" + ] + } + ], + "source": [ + "print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])\n", + "print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])" + ] + }, + { + 
"cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(7628, 44346)\n" + ] + } + ], + "source": [ + "messages_tfidf = tfidf_transformer.transform(messages_bow)\n", + "print(messages_tfidf.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<7628x44346 sparse matrix of type ''\n", + "\twith 417825 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "messages_tfidf" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#classifier\n", + "from sklearn.naive_bayes import MultinomialNB \n", + "category_detect_model = MultinomialNB().fit(messages_tfidf, train['SECTION'])" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "predicted: 3\n", + "expected: 1\n" + ] + } + ], + "source": [ + "print('predicted:', category_detect_model.predict(tfidf4)[0])\n", + "print('expected:', train['SECTION'][3])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3 0 3 ... 1 0 2]\n" + ] + } + ], + "source": [ + "all_predictions = category_detect_model.predict(messages_tfidf)\n", + "print(all_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.97 0.97 0.97 1686\n", + " 1 0.95 1.00 0.97 2772\n", + " 2 1.00 0.96 0.98 1924\n", + " 3 1.00 0.95 0.97 1246\n", + "\n", + " accuracy 0.97 7628\n", + " macro avg 0.98 0.97 0.97 7628\n", + "weighted avg 0.97 0.97 0.97 7628\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import classification_report\n", + "print (classification_report(train['SECTION'], all_predictions))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "pipeline = Pipeline([\n", + " ('bow', CountVectorizer(analyzer=text_process)), # strings to token integer counts\n", + " ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores\n", + " ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('bow',\n", + " CountVectorizer(analyzer=,\n", + " binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8',\n", + " input='content', lowercase=True, max_df=1.0,\n", + " max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None,\n", + " stop_words=None, strip_accents=None,\n", + " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", + " tokenizer=None, vocabulary=None)),\n", + " ('tfidf',\n", + " TfidfTransformer(norm='l2', smooth_idf=True,\n", + " sublinear_tf=False, use_idf=True)),\n", + " ('classifier',\n", + " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],\n", + " verbose=False)" + ] + }, + "execution_count": 33, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(train['STORY'],train['SECTION'])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 2 1 ... 1 0 1]\n" + ] + } + ], + "source": [ + "predictions = pipeline.predict(test['STORY'])\n", + "print(predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SECTION
01
12
21
31
41
51
61
72
81
92
100
113
122
131
142
151
161
172
183
192
202
211
222
230
240
252
262
273
283
290
......
27180
27192
27203
27211
27220
27231
27242
27250
27261
27272
27281
27291
27303
27311
27323
27331
27340
27353
27360
27371
27381
27391
27403
27410
27420
27431
27441
27451
27460
27471
\n", + "

2748 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " SECTION\n", + "0 1\n", + "1 2\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 2\n", + "8 1\n", + "9 2\n", + "10 0\n", + "11 3\n", + "12 2\n", + "13 1\n", + "14 2\n", + "15 1\n", + "16 1\n", + "17 2\n", + "18 3\n", + "19 2\n", + "20 2\n", + "21 1\n", + "22 2\n", + "23 0\n", + "24 0\n", + "25 2\n", + "26 2\n", + "27 3\n", + "28 3\n", + "29 0\n", + "... ...\n", + "2718 0\n", + "2719 2\n", + "2720 3\n", + "2721 1\n", + "2722 0\n", + "2723 1\n", + "2724 2\n", + "2725 0\n", + "2726 1\n", + "2727 2\n", + "2728 1\n", + "2729 1\n", + "2730 3\n", + "2731 1\n", + "2732 3\n", + "2733 1\n", + "2734 0\n", + "2735 3\n", + "2736 0\n", + "2737 1\n", + "2738 1\n", + "2739 1\n", + "2740 3\n", + "2741 0\n", + "2742 0\n", + "2743 1\n", + "2744 1\n", + "2745 1\n", + "2746 0\n", + "2747 1\n", + "\n", + "[2748 rows x 1 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = pd.DataFrame(predictions,columns=['SECTION'])\n", + "output" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "output.to_excel('output.xlsx',sheet_name='Sheet1',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SECTIONSTORY
012019 will see gadgets like gaming smartphones ...
12It has also unleashed a wave of changes in the...
21It can be confusing to pick the right smartpho...
31The mobile application is integrated with a da...
41We have rounded up some of the gadgets that sh...
51\"Imagine if every message you sent was kept wi...
61Positioned along the four sides of the Asus RO...
72In fact, when I applied to USC film school the...
81As spotted by Android Police, Netflix is testi...
92Her moves were immaculately choreographed as s...
100The NCP leadership was under tremendous pressu...
113On the traded volume front, 48.67 lakh shares ...
122They wrote, “Welcoming Makkal Selvan Vijay Set...
131The back of the phone features a 12MP+13MP AI ...
142Talking about how the pressure got to her at o...
151The Xiaomi Play is expected to have a CPU runn...
161In January 2019, the telecom industry added 21...
172Apart from the finalists, the grand finale als...
183\"We expect (a) slew of REIT IPOs to hit the ma...
192According to the same report, Avengers Endgame...
202“Wounded by Lannister riders, they will seek r...
211With so much hatred around, through the show, ...
222The Force Awakens, incidentally, wind up as th...
230The two have been taking potshots at each othe...
240Thakur said “they (authorities) wanted to forc...
252“It’s seriously delightful that our new Dracul...
262(Photo: Diljit Dosanjh/Instagram)Diljit Dosanj...
273Some respite came from the 6% growth in utilit...
283The outlook for India's rupee has deteriorated...
290However, political analysts said that it will ...
.........
27180The party renominated its sitting MP Ranjet Ra...
27192Was there a moment you were bullied?I was very...
27203In Delhi, gold of 99.9% and 99.5% purities fel...
27211Snapchat's controversial and criticised redesi...
27220Of the eight states that faced single-phase po...
27231Facebook pulled 513 Pages, Groups and accounts...
27242One of the most popular female actors in the 1...
27250Two Telugu TV actresses died in a road acciden...
27261Google CEO Sundar Pichai introduced Duplex ear...
27272”Priyanka was replaced by Katrina Kaif, who wo...
27281The Chinese smartphone manufacturer today intr...
27291Samsung, the world’s largest smartphone maker,...
27303Reliance Securities has revised TCS target pri...
27311Music streaming may not be novel, but it’s sti...
27323Ericsson India Pvt. Ltd had moved the Supreme ...
27331On the other hand, the hotel and airline loyal...
27340The Narendra Modi government simply finished t...
27353The yield on 10-year Treasuries climbed two ba...
27360Section 126 of the Representation of People Ac...
27371With these techniques, machines are also learn...
27381The traffic challan payment is also restricted...
27391“We are not like Western countries, where peop...
27403Further, SBICAP Securities says delivery of th...
27410Two, Raj Thackeray’s Maharashtra Navnirman Sen...
27420Senior leaders of the BJP are using the sugges...
27431According to researchers, fraud in the mobile ...
27441The iPhone XS and XS Max share the Apple A12 c...
27451On the photography front, the Note 5 Pro featu...
27460UDAY mandated that discoms bring the gap betwe...
27471Ripple also helps bank customers send money to...
\n", + "

2748 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " SECTION STORY\n", + "0 1 2019 will see gadgets like gaming smartphones ...\n", + "1 2 It has also unleashed a wave of changes in the...\n", + "2 1 It can be confusing to pick the right smartpho...\n", + "3 1 The mobile application is integrated with a da...\n", + "4 1 We have rounded up some of the gadgets that sh...\n", + "5 1 \"Imagine if every message you sent was kept wi...\n", + "6 1 Positioned along the four sides of the Asus RO...\n", + "7 2 In fact, when I applied to USC film school the...\n", + "8 1 As spotted by Android Police, Netflix is testi...\n", + "9 2 Her moves were immaculately choreographed as s...\n", + "10 0 The NCP leadership was under tremendous pressu...\n", + "11 3 On the traded volume front, 48.67 lakh shares ...\n", + "12 2 They wrote, “Welcoming Makkal Selvan Vijay Set...\n", + "13 1 The back of the phone features a 12MP+13MP AI ...\n", + "14 2 Talking about how the pressure got to her at o...\n", + "15 1 The Xiaomi Play is expected to have a CPU runn...\n", + "16 1 In January 2019, the telecom industry added 21...\n", + "17 2 Apart from the finalists, the grand finale als...\n", + "18 3 \"We expect (a) slew of REIT IPOs to hit the ma...\n", + "19 2 According to the same report, Avengers Endgame...\n", + "20 2 “Wounded by Lannister riders, they will seek r...\n", + "21 1 With so much hatred around, through the show, ...\n", + "22 2 The Force Awakens, incidentally, wind up as th...\n", + "23 0 The two have been taking potshots at each othe...\n", + "24 0 Thakur said “they (authorities) wanted to forc...\n", + "25 2 “It’s seriously delightful that our new Dracul...\n", + "26 2 (Photo: Diljit Dosanjh/Instagram)Diljit Dosanj...\n", + "27 3 Some respite came from the 6% growth in utilit...\n", + "28 3 The outlook for India's rupee has deteriorated...\n", + "29 0 However, political analysts said that it will ...\n", + "... ... ...\n", + "2718 0 The party renominated its sitting MP Ranjet Ra...\n", + "2719 2 Was there a moment you were bullied?I was very...\n", + "2720 3 In Delhi, gold of 99.9% and 99.5% purities fel...\n", + "2721 1 Snapchat's controversial and criticised redesi...\n", + "2722 0 Of the eight states that faced single-phase po...\n", + "2723 1 Facebook pulled 513 Pages, Groups and accounts...\n", + "2724 2 One of the most popular female actors in the 1...\n", + "2725 0 Two Telugu TV actresses died in a road acciden...\n", + "2726 1 Google CEO Sundar Pichai introduced Duplex ear...\n", + "2727 2 ”Priyanka was replaced by Katrina Kaif, who wo...\n", + "2728 1 The Chinese smartphone manufacturer today intr...\n", + "2729 1 Samsung, the world’s largest smartphone maker,...\n", + "2730 3 Reliance Securities has revised TCS target pri...\n", + "2731 1 Music streaming may not be novel, but it’s sti...\n", + "2732 3 Ericsson India Pvt. 
Ltd had moved the Supreme ...\n", + "2733 1 On the other hand, the hotel and airline loyal...\n", + "2734 0 The Narendra Modi government simply finished t...\n", + "2735 3 The yield on 10-year Treasuries climbed two ba...\n", + "2736 0 Section 126 of the Representation of People Ac...\n", + "2737 1 With these techniques, machines are also learn...\n", + "2738 1 The traffic challan payment is also restricted...\n", + "2739 1 “We are not like Western countries, where peop...\n", + "2740 3 Further, SBICAP Securities says delivery of th...\n", + "2741 0 Two, Raj Thackeray’s Maharashtra Navnirman Sen...\n", + "2742 0 Senior leaders of the BJP are using the sugges...\n", + "2743 1 According to researchers, fraud in the mobile ...\n", + "2744 1 The iPhone XS and XS Max share the Apple A12 c...\n", + "2745 1 On the photography front, the Note 5 Pro featu...\n", + "2746 0 UDAY mandated that discoms bring the gap betwe...\n", + "2747 1 Ripple also helps bank customers send money to...\n", + "\n", + "[2748 rows x 2 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output['STORY'] = test['STORY']\n", + "output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC\n", + "from sklearn.feature_extraction.text import TfidfTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}\n", + "from sklearn.model_selection import GridSearchCV\n", + "grid = GridSearchCV(SVC(),param_grid,verbose=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "pipeline = Pipeline([\n", + " ('bow', CountVectorizer(analyzer=text_process)), # strings to token integer counts\n", + " ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores\n", + " ('classifier', grid), # train on TF-IDF vectors w/ Naive Bayes classifier\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Subham\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n", + " warnings.warn(CV_WARNING, FutureWarning)\n", + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 25 candidates, totalling 75 fits\n", + "[CV] C=0.1, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.561, total= 30.1s\n", + "[CV] C=0.1, gamma=1, kernel=rbf ......................................\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 30.0s remaining: 0.0s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] .......... 
C=0.1, gamma=1, kernel=rbf, score=0.561, total= 30.8s\n", + "[CV] C=0.1, gamma=1, kernel=rbf ......................................\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 1.0min remaining: 0.0s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.560, total= 29.4s\n", + "[CV] C=0.1, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.391, total= 29.9s\n", + "[CV] C=0.1, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.385, total= 29.3s\n", + "[CV] C=0.1, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.387, total= 29.3s\n", + "[CV] C=0.1, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total= 29.2s\n", + "[CV] C=0.1, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total= 28.8s\n", + "[CV] C=0.1, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total= 29.3s\n", + "[CV] C=0.1, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.363, total= 28.4s\n", + "[CV] C=0.1, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.363, total= 27.5s\n", + "[CV] C=0.1, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.363, total= 27.8s\n", + "[CV] C=0.1, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.363, total= 27.6s\n", + "[CV] C=0.1, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.363, total= 27.5s\n", + "[CV] C=0.1, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.363, total= 28.6s\n", + "[CV] C=1, gamma=1, kernel=rbf ........................................\n", + "[CV] ............ C=1, gamma=1, kernel=rbf, score=0.961, total= 26.9s\n", + "[CV] C=1, gamma=1, kernel=rbf ........................................\n", + "[CV] ............ C=1, gamma=1, kernel=rbf, score=0.956, total= 26.6s\n", + "[CV] C=1, gamma=1, kernel=rbf ........................................\n", + "[CV] ............ C=1, gamma=1, kernel=rbf, score=0.962, total= 26.6s\n", + "[CV] C=1, gamma=0.1, kernel=rbf ......................................\n", + "[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.940, total= 18.9s\n", + "[CV] C=1, gamma=0.1, kernel=rbf ......................................\n", + "[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.937, total= 18.7s\n", + "[CV] C=1, gamma=0.1, kernel=rbf ......................................\n", + "[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.941, total= 19.0s\n", + "[CV] C=1, gamma=0.01, kernel=rbf .....................................\n", + "[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.416, total= 27.8s\n", + "[CV] C=1, gamma=0.01, kernel=rbf .....................................\n", + "[CV] ......... 
C=1, gamma=0.01, kernel=rbf, score=0.417, total= 27.7s\n", + "[CV] C=1, gamma=0.01, kernel=rbf .....................................\n", + "[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.417, total= 27.9s\n", + "[CV] C=1, gamma=0.001, kernel=rbf ....................................\n", + "[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.363, total= 28.2s\n", + "[CV] C=1, gamma=0.001, kernel=rbf ....................................\n", + "[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.363, total= 28.2s\n", + "[CV] C=1, gamma=0.001, kernel=rbf ....................................\n", + "[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.363, total= 28.4s\n", + "[CV] C=1, gamma=0.0001, kernel=rbf ...................................\n", + "[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.363, total= 28.8s\n", + "[CV] C=1, gamma=0.0001, kernel=rbf ...................................\n", + "[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.363, total= 28.8s\n", + "[CV] C=1, gamma=0.0001, kernel=rbf ...................................\n", + "[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.363, total= 31.9s\n", + "[CV] C=10, gamma=1, kernel=rbf .......................................\n", + "[CV] ........... C=10, gamma=1, kernel=rbf, score=0.963, total= 31.7s\n", + "[CV] C=10, gamma=1, kernel=rbf .......................................\n", + "[CV] ........... C=10, gamma=1, kernel=rbf, score=0.958, total= 31.3s\n", + "[CV] C=10, gamma=1, kernel=rbf .......................................\n", + "[CV] ........... C=10, gamma=1, kernel=rbf, score=0.964, total= 28.3s\n", + "[CV] C=10, gamma=0.1, kernel=rbf .....................................\n", + "[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.972, total= 14.4s\n", + "[CV] C=10, gamma=0.1, kernel=rbf .....................................\n", + "[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.965, total= 14.5s\n", + "[CV] C=10, gamma=0.1, kernel=rbf .....................................\n", + "[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.970, total= 14.5s\n", + "[CV] C=10, gamma=0.01, kernel=rbf ....................................\n", + "[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.947, total= 18.0s\n", + "[CV] C=10, gamma=0.01, kernel=rbf ....................................\n", + "[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.942, total= 17.7s\n", + "[CV] C=10, gamma=0.01, kernel=rbf ....................................\n", + "[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.945, total= 18.1s\n", + "[CV] C=10, gamma=0.001, kernel=rbf ...................................\n", + "[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.419, total= 27.9s\n", + "[CV] C=10, gamma=0.001, kernel=rbf ...................................\n", + "[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.421, total= 27.9s\n", + "[CV] C=10, gamma=0.001, kernel=rbf ...................................\n", + "[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.420, total= 29.2s\n", + "[CV] C=10, gamma=0.0001, kernel=rbf ..................................\n", + "[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.363, total= 28.3s\n", + "[CV] C=10, gamma=0.0001, kernel=rbf ..................................\n", + "[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.363, total= 28.5s\n", + "[CV] C=10, gamma=0.0001, kernel=rbf ..................................\n", + "[CV] ...... 
C=10, gamma=0.0001, kernel=rbf, score=0.363, total= 28.2s\n", + "[CV] C=100, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=100, gamma=1, kernel=rbf, score=0.963, total= 27.5s\n", + "[CV] C=100, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=100, gamma=1, kernel=rbf, score=0.958, total= 27.2s\n", + "[CV] C=100, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=100, gamma=1, kernel=rbf, score=0.964, total= 27.5s\n", + "[CV] C=100, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.972, total= 14.2s\n", + "[CV] C=100, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.964, total= 14.5s\n", + "[CV] C=100, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.971, total= 14.6s\n", + "[CV] C=100, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.972, total= 13.6s\n", + "[CV] C=100, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.965, total= 13.7s\n", + "[CV] C=100, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.970, total= 14.5s\n", + "[CV] C=100, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=100, gamma=0.001, kernel=rbf, score=0.948, total= 18.3s\n", + "[CV] C=100, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=100, gamma=0.001, kernel=rbf, score=0.943, total= 17.7s\n", + "[CV] C=100, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=100, gamma=0.001, kernel=rbf, score=0.946, total= 18.1s\n", + "[CV] C=100, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.419, total= 29.7s\n", + "[CV] C=100, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.421, total= 28.2s\n", + "[CV] C=100, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.420, total= 29.2s\n", + "[CV] C=1000, gamma=1, kernel=rbf .....................................\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.963, total= 28.3s\n", + "[CV] C=1000, gamma=1, kernel=rbf .....................................\n", + "[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.958, total= 27.6s\n", + "[CV] C=1000, gamma=1, kernel=rbf .....................................\n", + "[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.964, total= 28.6s\n", + "[CV] C=1000, gamma=0.1, kernel=rbf ...................................\n", + "[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.972, total= 15.1s\n", + "[CV] C=1000, gamma=0.1, kernel=rbf ...................................\n", + "[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.964, total= 15.9s\n", + "[CV] C=1000, gamma=0.1, kernel=rbf ...................................\n", + "[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.971, total= 15.3s\n", + "[CV] C=1000, gamma=0.01, kernel=rbf ..................................\n", + "[CV] ...... 
C=1000, gamma=0.01, kernel=rbf, score=0.973, total= 14.4s\n", + "[CV] C=1000, gamma=0.01, kernel=rbf ..................................\n", + "[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.963, total= 14.1s\n", + "[CV] C=1000, gamma=0.01, kernel=rbf ..................................\n", + "[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.970, total= 14.4s\n", + "[CV] C=1000, gamma=0.001, kernel=rbf .................................\n", + "[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.972, total= 14.1s\n", + "[CV] C=1000, gamma=0.001, kernel=rbf .................................\n", + "[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.965, total= 14.2s\n", + "[CV] C=1000, gamma=0.001, kernel=rbf .................................\n", + "[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.971, total= 14.4s\n", + "[CV] C=1000, gamma=0.0001, kernel=rbf ................................\n", + "[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.948, total= 18.2s\n", + "[CV] C=1000, gamma=0.0001, kernel=rbf ................................\n", + "[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.943, total= 18.5s\n", + "[CV] C=1000, gamma=0.0001, kernel=rbf ................................\n", + "[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.946, total= 18.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 75 out of 75 | elapsed: 29.5min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('bow',\n", + " CountVectorizer(analyzer=,\n", + " binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8',\n", + " input='content', lowercase=True, max_df=1.0,\n", + " max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None,\n", + " stop_words=None, strip_accents=None,\n", + " token_pattern='(?u)\\\\b\\\\w\\\\w...\n", + " decision_function_shape='ovr',\n", + " degree=3, gamma='auto_deprecated',\n", + " kernel='rbf', max_iter=-1,\n", + " probability=False,\n", + " random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False),\n", + " iid='warn', n_jobs=None,\n", + " param_grid={'C': [0.1, 1, 10, 100, 1000],\n", + " 'gamma': [1, 0.1, 0.01, 0.001,\n", + " 0.0001],\n", + " 'kernel': ['rbf']},\n", + " pre_dispatch='2*n_jobs', refit=True,\n", + " return_train_score=False, scoring=None,\n", + " verbose=3))],\n", + " verbose=False)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(train['STORY'],train['SECTION'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SECTION
01
12
21
31
41
51
61
72
81
92
100
113
122
131
142
151
163
172
183
192
202
212
222
230
240
252
262
273
283
290
......
27180
27192
27203
27211
27220
27231
27242
27252
27261
27272
27281
27291
27303
27312
27323
27331
27340
27353
27360
27371
27381
27391
27403
27410
27420
27431
27441
27451
27460
27471
\n", + "

2748 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " SECTION\n", + "0 1\n", + "1 2\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 2\n", + "8 1\n", + "9 2\n", + "10 0\n", + "11 3\n", + "12 2\n", + "13 1\n", + "14 2\n", + "15 1\n", + "16 3\n", + "17 2\n", + "18 3\n", + "19 2\n", + "20 2\n", + "21 2\n", + "22 2\n", + "23 0\n", + "24 0\n", + "25 2\n", + "26 2\n", + "27 3\n", + "28 3\n", + "29 0\n", + "... ...\n", + "2718 0\n", + "2719 2\n", + "2720 3\n", + "2721 1\n", + "2722 0\n", + "2723 1\n", + "2724 2\n", + "2725 2\n", + "2726 1\n", + "2727 2\n", + "2728 1\n", + "2729 1\n", + "2730 3\n", + "2731 2\n", + "2732 3\n", + "2733 1\n", + "2734 0\n", + "2735 3\n", + "2736 0\n", + "2737 1\n", + "2738 1\n", + "2739 1\n", + "2740 3\n", + "2741 0\n", + "2742 0\n", + "2743 1\n", + "2744 1\n", + "2745 1\n", + "2746 0\n", + "2747 1\n", + "\n", + "[2748 rows x 1 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = pipeline.predict(test['STORY'])\n", + "output_svm_quote = pd.DataFrame(predictions,columns=['SECTION'])\n", + "output_svm_quote" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SECTION
01
12
21
31
41
51
61
72
81
92
100
113
122
131
142
151
163
172
183
192
202
212
222
230
240
252
262
273
283
290
......
27180
27192
27203
27211
27220
27231
27242
27252
27261
27272
27281
27291
27303
27311
27323
27331
27340
27353
27360
27371
27381
27391
27403
27410
27420
27431
27441
27451
27460
27471
\n", + "

2748 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " SECTION\n", + "0 1\n", + "1 2\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 2\n", + "8 1\n", + "9 2\n", + "10 0\n", + "11 3\n", + "12 2\n", + "13 1\n", + "14 2\n", + "15 1\n", + "16 3\n", + "17 2\n", + "18 3\n", + "19 2\n", + "20 2\n", + "21 2\n", + "22 2\n", + "23 0\n", + "24 0\n", + "25 2\n", + "26 2\n", + "27 3\n", + "28 3\n", + "29 0\n", + "... ...\n", + "2718 0\n", + "2719 2\n", + "2720 3\n", + "2721 1\n", + "2722 0\n", + "2723 1\n", + "2724 2\n", + "2725 2\n", + "2726 1\n", + "2727 2\n", + "2728 1\n", + "2729 1\n", + "2730 3\n", + "2731 1\n", + "2732 3\n", + "2733 1\n", + "2734 0\n", + "2735 3\n", + "2736 0\n", + "2737 1\n", + "2738 1\n", + "2739 1\n", + "2740 3\n", + "2741 0\n", + "2742 0\n", + "2743 1\n", + "2744 1\n", + "2745 1\n", + "2746 0\n", + "2747 1\n", + "\n", + "[2748 rows x 1 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = pipeline.predict(test['STORY'])\n", + "output_svm = pd.DataFrame(predictions,columns=['SECTION'])\n", + "output_svm" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "output_svm_quote.to_excel('output_svm_quote.xlsx',sheet_name='Sheet1',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "output_svm_skim.to_excel('output_svm_skim.xlsx',sheet_name='Sheet1',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sadva', 'vavb', 'is', 'what']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token = ['sadva','vavb','\"',\"'\",'`','is','what']\n", + "[word for word in token if word.lower() not in (stopwords.words('english') and ['\"',\"'\",'`','”','“'])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/News_category_project/Data_Test.xlsx b/News_category_project/Data_Test.xlsx new file mode 100644 index 0000000..fd65f62 Binary files /dev/null and b/News_category_project/Data_Test.xlsx differ diff --git a/News_category_project/Data_Train.xlsx b/News_category_project/Data_Train.xlsx new file mode 100644 index 0000000..5f65921 Binary files /dev/null and b/News_category_project/Data_Train.xlsx differ diff --git a/News_category_project/News_cat.ipynb b/News_category_project/News_cat.ipynb new file mode 100644 index 0000000..fc54971 --- /dev/null +++ b/News_category_project/News_cat.ipynb @@ -0,0 +1,2856 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# News Category Finder using NLP\n", + "\n", + "\n", + "Please check the test, train and sample submission files\n", + "#### Importing Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_excel('Data_Train.xlsx', sheet_name='Sheet1')\n", + "test = pd.read_excel('Data_Test.xlsx', sheet_name='Sheet1')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " STORY SECTION\n", + "0 But the most painful was the huge reversal in ... 3\n", + "1 How formidable is the opposition alliance amon... 0\n", + "2 Most Asian currencies were trading lower today... 3\n", + "3 If you want to answer any question, click on ‘... 1\n", + "4 In global markets, gold prices edged up today ... 3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " STORY\n", + "0 2019 will see gadgets like gaming smartphones ...\n", + "1 It has also unleashed a wave of changes in the...\n", + "2 It can be confusing to pick the right smartpho...\n", + "3 The mobile application is integrated with a da...\n", + "4 We have rounded up some of the gadgets that sh..." + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " SECTION\n", + "count 7628.000000\n", + "mean 1.357892\n", + "std 0.999341\n", + "min 0.000000\n", + "25% 1.000000\n", + "50% 1.000000\n", + "75% 2.000000\n", + "max 3.000000" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " STORY \n", + " count unique top freq\n", + "SECTION \n", + "0 1686 1673 This story has been published from a wire agen... 4\n", + "1 2772 2731 This story has been published from a wire agen... 13\n", + "2 1924 1914 The consensus reads, “Exciting, entertaining, ... 3\n", + "3 1246 1233 This story has been published from a wire agen... 11" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.groupby('SECTION').describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.tokenize import word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running\n", + "run\n" + ] + } + ], + "source": [ + "from nltk.stem.wordnet import WordNetLemmatizer \n", + "lem = WordNetLemmatizer()\n", + "\n", + "from nltk.stem.porter import PorterStemmer \n", + "stem = PorterStemmer()\n", + "\n", + "word = \"running\" \n", + "print(lem.lemmatize(word))\n", + "\n", + "print(stem.stem(word))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.stem.wordnet import WordNetLemmatizer \n", + "from nltk.stem.porter import PorterStemmer \n", + "lem = WordNetLemmatizer()\n", + "stem = PorterStemmer()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Processing Tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "\n", + "def text_process(mess):\n", + " \"\"\"\n", + " Takes in a string of text, then performs the following:\n", + " 1. Remove all punctuation\n", + " 2. Remove all stopwords\n", + " 3. 
Returns a list of the cleaned text\n", + " \"\"\"\n", + " # Check characters to see if they are in punctuation\n", + " nopunc = [char for char in mess if char not in string.punctuation]\n", + "\n", + " # Join the characters again to form the string.\n", + " nopunc = ''.join(nopunc)\n", + " \n", + " token = word_tokenize(nopunc)\n", + " \n", + " # Now just remove any stopwords\n", + " no_noise = [word for word in token if word.lower() not in (stopwords.words('english') and ['\"',\"'\",'`','”','“'])]\n", + " # Stemming\n", + " \n", + " \n", + " return no_noise\n", + "#[stem.stem(word.lower()) for word in no_noise]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [painful, huge, reversal, fee, income, unheard...\n", + "1 [formidable, opposition, alliance, among, Cong...\n", + "2 [Asian, currencies, trading, lower, today, Sou...\n", + "3 [want, answer, question, click, ‘, Answer, ’, ...\n", + "4 [global, markets, gold, prices, edged, today, ...\n", + "5 [BEIJING, Chinese, tech, giant, Huawei, announ...\n", + "6 [Mumbai, India, Incs, external, commercial, bo...\n", + "7 [Wednesday, Federal, Reserve, Chairman, Jerome...\n", + "8 [give, audience, already, done, Yeh, Hai, Aash...\n", + "9 [com, Arbaaz, Khan, spoke, getting, back, Daba...\n", + "10 [“, One, would, think, development, testing, p...\n", + "11 [far, year, rupee, gained, 07, foreign, invest...\n", + "12 [Xiaomi, however, sees, presence, Jio, rural, ...\n", + "13 [ad, reads, bells, whistles, Bezel, notch, app...\n", + "14 [Tuesday, Powell, said, healthy, US, economy, ...\n", + "15 [feature, help, make, display, responsive, int...\n", + "16 [TikTok, popular, among, children, facing, cri...\n", + "17 [company, hive, ratings, business, whollyowned...\n", + "18 [chooses, hide, CP, colleagues, move, mother, ...\n", + "19 [’, right, opera, house, simply, goes, show, A...\n", + "20 [Facebook, said, eligible, creators, would, ab...\n", + "21 [Starring, Varun, Dhawan, Alia, Bhatt, Sonaksh...\n", + "22 [GKN, Securities, barred, misuse, socalled, da...\n", + "23 [Fintech, startup, Zeta, cofounded, Bhavin, Tu...\n", + "24 [story, published, wire, agency, feed, without...\n", + "25 [Globally, established, companies, Stratasys, ...\n", + "26 [statements, Yeddyurappa, says, air, strikes, ...\n", + "27 [NDA, seeks, reelection, agriculture, form, im...\n", + "28 [Yeddyurappa, said, IAF, air, strikes, would, ...\n", + "29 [”, two, releases, year, far, Milan, Talkies, ...\n", + " ... 
\n", + "7598 [“, TDP, party, ideology, gotten, sidelined, e...\n", + "7599 [far, whenever, ’, reviewed, Kindles, Paperwhi...\n", + "7600 [day, markets, saw, high, volatility, followin...\n", + "7601 [Today, Gmail, allows, 15GB, free, storage, Us...\n", + "7602 [Aparajita, Sarangi, took, voluntary, retireme...\n", + "7603 [Investors, awaiting, economic, growth, data, ...\n", + "7604 [advice, online, survivalists, moves, unpopula...\n", + "7605 [time, developers, disguising, app, pretend, c...\n", + "7606 [Lok, Sabha, elections, 2019, Fifth, phase, vo...\n", + "7607 [watchdog, passed, five, separate, orders, tog...\n", + "7608 [Twitter, post, last, week, OnePlus, confirmed...\n", + "7609 [iOSonly, email, client, named, Spark, launche...\n", + "7610 [said, question, really, spend, ₹10000, portab...\n", + "7611 [’, want, kind, movie, felt, done, similar, ki...\n", + "7612 [1999, film, Mother, received, Best, Foreign, ...\n", + "7613 [Mohan, Babu, considered, political, heavyweig...\n", + "7614 [SP, 500, opened, higher, 126, points, 004, 28...\n", + "7615 [However, reports, suggest, would, another, sm...\n", + "7616 [Mumbai, Indian, stocks, rose, key, indices, e...\n", + "7617 [Sure, others, slightly, faster, slightly, sha...\n", + "7618 [147, million, pixels, one, billion, colours, ...\n", + "7619 [BJD, supporter, Puri, Congress, leaders, well...\n", + "7620 [Bollywood, celebrities, took, social, media, ...\n", + "7621 [However, confirmation, developers, games, wou...\n", + "7622 [terms, optics, back, Redmi, Note, 7, boasts, ...\n", + "7623 [Karnataka, Congress, bastion, also, gave, BJP...\n", + "7624 [film, also, features, Janhvi, Kapoor, revolve...\n", + "7625 [database, created, bringing, together, crimin...\n", + "7626 [state, uneasy, relationship, mainland, since,...\n", + "7627 [Virus, stars, Kunchacko, Boban, Tovino, Thoma...\n", + "Name: STORY, Length: 7628, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train['STORY'].apply(text_process)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bag of words" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "bow_transformer = CountVectorizer(analyzer=text_process).fit(train['STORY'])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "44346\n" + ] + } + ], + "source": [ + "print(len(bow_transformer.vocabulary_))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (0, 98)\t1\n", + " (0, 458)\t1\n", + " (0, 4354)\t1\n", + " (0, 6127)\t1\n", + " (0, 7903)\t1\n", + " (0, 17114)\t1\n", + " (0, 19913)\t1\n", + " (0, 20711)\t1\n", + " (0, 21236)\t1\n", + " (0, 22284)\t1\n", + " (0, 23796)\t1\n", + " (0, 24786)\t1\n", + " (0, 25194)\t1\n", + " (0, 25607)\t1\n", + " (0, 26393)\t1\n", + " (0, 26400)\t1\n", + " (0, 26889)\t1\n", + " (0, 27416)\t1\n", + " (0, 28063)\t1\n", + " (0, 28740)\t2\n", + " (0, 28793)\t2\n", + " (0, 29335)\t1\n", + " (0, 32289)\t3\n", + " (0, 33750)\t1\n", + " (0, 34213)\t1\n", + " (0, 34771)\t1\n", + " (0, 35747)\t1\n", + " (0, 37546)\t1\n", + " (0, 37647)\t1\n", + " (0, 
39826)\t1\n", + " (0, 41132)\t2\n", + " (0, 42868)\t1\n", + "(1, 44346)\n" + ] + } + ], + "source": [ + "bow4 = bow_transformer.transform([train['STORY'][4]])\n", + "print(bow4) # vectors pointing from origin\n", + "print(bow4.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "markets\n", + "stock\n" + ] + } + ], + "source": [ + "print(bow_transformer.get_feature_names()[32289])\n", + "print(bow_transformer.get_feature_names()[39826])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "messages_bow = bow_transformer.transform(train['STORY'])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of Sparse Matrix: (7628, 44346)\n", + "Amount of Non-Zero occurences: 417825\n" + ] + } + ], + "source": [ + "print('Shape of Sparse Matrix: ', messages_bow.shape)\n", + "print('Amount of Non-Zero occurences: ', messages_bow.nnz)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sparsity: 0.12351772521704532\n" + ] + } + ], + "source": [ + "sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))\n", + "\n", + "print(f'sparsity: {sparsity}')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (0, 42868)\t0.1653840117317503\n", + " (0, 41132)\t0.23264193296874802\n", + " (0, 39826)\t0.12464706628982292\n", + " (0, 37647)\t0.13059926000708844\n", + " (0, 37546)\t0.13533155309101896\n", + " (0, 35747)\t0.11016931263373622\n", + " (0, 34771)\t0.11197548409940615\n", + " (0, 34213)\t0.17507399610069765\n", + " (0, 33750)\t0.2329734713609702\n", + " (0, 32289)\t0.3411710195006078\n", + " (0, 29335)\t0.14164721721626908\n", + " (0, 28793)\t0.2878801405286177\n", + " (0, 28740)\t0.21695807780745277\n", + " (0, 28063)\t0.1342731302228393\n", + " (0, 27416)\t0.1771020108713445\n", + " (0, 26889)\t0.13365789947306533\n", + " (0, 26400)\t0.20124735910228528\n", + " (0, 26393)\t0.1288535100738698\n", + " (0, 25607)\t0.18575848290189373\n", + " (0, 25194)\t0.20712745650428582\n", + " (0, 24786)\t0.09579350098100424\n", + " (0, 23796)\t0.130962730415073\n", + " (0, 22284)\t0.14220199953463913\n", + " (0, 21236)\t0.09531611135230537\n", + " (0, 20711)\t0.18575848290189373\n", + " (0, 19913)\t0.14959653531880082\n", + " (0, 17114)\t0.17929921522347367\n", + " (0, 7903)\t0.15233170401746512\n", + " (0, 6127)\t0.11539264864412509\n", + " (0, 4354)\t0.14394007026430886\n", + " (0, 458)\t0.2436579581621663\n", + " (0, 98)\t0.16899942411275612\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "\n", + "tfidf_transformer = TfidfTransformer().fit(messages_bow)\n", + "\n", + "# TEST\n", + "tfidf4 = tfidf_transformer.transform(bow4)\n", + "print(tfidf4)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.74248747675401\n", + "8.33027414165613\n" + ] + } + ], + "source": [ + "print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])\n", + "print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])" + ] + }, + { + 
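The cells above turn the 7,628 training stories into a 7,628 x 44,346 bag-of-words matrix (about 0.12% of entries non-zero) and then re-weight the counts with TF-IDF, so terms that are frequent within a story but not across the whole corpus carry more weight. A minimal, self-contained sketch of the same CountVectorizer -> TfidfTransformer hand-off, using a made-up three-document corpus rather than the notebook's data:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Toy corpus, invented purely to illustrate the shapes involved.
docs = ["gold prices edged up today",
        "stocks edged lower today",
        "gold and stocks moved higher today"]

bow = CountVectorizer()                 # default analyzer here, not text_process
counts = bow.fit_transform(docs)        # sparse document-term count matrix
print(counts.shape, counts.nnz)         # (3, vocabulary size) and the non-zero cell count

tfidf = TfidfTransformer().fit(counts)  # learns IDF weights from the raw counts
print(tfidf.transform(counts).toarray().round(2))  # terms rare across docs get larger weights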
"cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(7628, 44346)\n" + ] + } + ], + "source": [ + "messages_tfidf = tfidf_transformer.transform(messages_bow)\n", + "print(messages_tfidf.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<7628x44346 sparse matrix of type ''\n", + "\twith 417825 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "messages_tfidf" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#classifier\n", + "from sklearn.naive_bayes import MultinomialNB \n", + "category_detect_model = MultinomialNB().fit(messages_tfidf, train['SECTION'])" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "predicted: 3\n", + "expected: 1\n" + ] + } + ], + "source": [ + "print('predicted:', category_detect_model.predict(tfidf4)[0])\n", + "print('expected:', train['SECTION'][3])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3 0 3 ... 1 0 2]\n" + ] + } + ], + "source": [ + "all_predictions = category_detect_model.predict(messages_tfidf)\n", + "print(all_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.97 0.97 0.97 1686\n", + " 1 0.95 1.00 0.97 2772\n", + " 2 1.00 0.96 0.98 1924\n", + " 3 1.00 0.95 0.97 1246\n", + "\n", + " accuracy 0.97 7628\n", + " macro avg 0.98 0.97 0.97 7628\n", + "weighted avg 0.97 0.97 0.97 7628\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import classification_report\n", + "print (classification_report(train['SECTION'], all_predictions))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "pipeline = Pipeline([\n", + " ('bow', CountVectorizer(analyzer=text_process)), # strings to token integer counts\n", + " ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores\n", + " ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('bow',\n", + " CountVectorizer(analyzer=,\n", + " binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8',\n", + " input='content', lowercase=True, max_df=1.0,\n", + " max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None,\n", + " stop_words=None, strip_accents=None,\n", + " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", + " tokenizer=None, vocabulary=None)),\n", + " ('tfidf',\n", + " TfidfTransformer(norm='l2', smooth_idf=True,\n", + " sublinear_tf=False, use_idf=True)),\n", + " ('classifier',\n", + " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],\n", + " verbose=False)" + ] + }, + "execution_count": 33, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(train['STORY'],train['SECTION'])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 2 1 ... 1 0 1]\n" + ] + } + ], + "source": [ + "predictions = pipeline.predict(test['STORY'])\n", + "print(predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
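The 0.97 classification report a few cells above is computed on the same rows the MultinomialNB model was trained on, so it overstates how well the pipeline generalises. A short held-out check with the same bow -> tfidf -> classifier pipeline could look like the sketch below (the split variables and the 80/20 ratio are illustrative, not part of the notebook):

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Carve a stratified validation split out of the labelled training data.
X_tr, X_val, y_tr, y_val = train_test_split(
    train['STORY'], train['SECTION'],
    test_size=0.2, random_state=42, stratify=train['SECTION'])

pipeline.fit(X_tr, y_tr)                               # same Pipeline object as above
print(classification_report(y_val, pipeline.predict(X_val)))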
" + ], + "text/plain": [ + " SECTION\n", + "0 1\n", + "1 2\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 2\n", + "8 1\n", + "9 2\n", + "10 0\n", + "11 3\n", + "12 2\n", + "13 1\n", + "14 2\n", + "15 1\n", + "16 1\n", + "17 2\n", + "18 3\n", + "19 2\n", + "20 2\n", + "21 1\n", + "22 2\n", + "23 0\n", + "24 0\n", + "25 2\n", + "26 2\n", + "27 3\n", + "28 3\n", + "29 0\n", + "... ...\n", + "2718 0\n", + "2719 2\n", + "2720 3\n", + "2721 1\n", + "2722 0\n", + "2723 1\n", + "2724 2\n", + "2725 0\n", + "2726 1\n", + "2727 2\n", + "2728 1\n", + "2729 1\n", + "2730 3\n", + "2731 1\n", + "2732 3\n", + "2733 1\n", + "2734 0\n", + "2735 3\n", + "2736 0\n", + "2737 1\n", + "2738 1\n", + "2739 1\n", + "2740 3\n", + "2741 0\n", + "2742 0\n", + "2743 1\n", + "2744 1\n", + "2745 1\n", + "2746 0\n", + "2747 1\n", + "\n", + "[2748 rows x 1 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = pd.DataFrame(predictions,columns=['SECTION'])\n", + "output" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "output.to_excel('output.xlsx',sheet_name='Sheet1',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " SECTION STORY\n", + "0 1 2019 will see gadgets like gaming smartphones ...\n", + "1 2 It has also unleashed a wave of changes in the...\n", + "2 1 It can be confusing to pick the right smartpho...\n", + "3 1 The mobile application is integrated with a da...\n", + "4 1 We have rounded up some of the gadgets that sh...\n", + "5 1 \"Imagine if every message you sent was kept wi...\n", + "6 1 Positioned along the four sides of the Asus RO...\n", + "7 2 In fact, when I applied to USC film school the...\n", + "8 1 As spotted by Android Police, Netflix is testi...\n", + "9 2 Her moves were immaculately choreographed as s...\n", + "10 0 The NCP leadership was under tremendous pressu...\n", + "11 3 On the traded volume front, 48.67 lakh shares ...\n", + "12 2 They wrote, “Welcoming Makkal Selvan Vijay Set...\n", + "13 1 The back of the phone features a 12MP+13MP AI ...\n", + "14 2 Talking about how the pressure got to her at o...\n", + "15 1 The Xiaomi Play is expected to have a CPU runn...\n", + "16 1 In January 2019, the telecom industry added 21...\n", + "17 2 Apart from the finalists, the grand finale als...\n", + "18 3 \"We expect (a) slew of REIT IPOs to hit the ma...\n", + "19 2 According to the same report, Avengers Endgame...\n", + "20 2 “Wounded by Lannister riders, they will seek r...\n", + "21 1 With so much hatred around, through the show, ...\n", + "22 2 The Force Awakens, incidentally, wind up as th...\n", + "23 0 The two have been taking potshots at each othe...\n", + "24 0 Thakur said “they (authorities) wanted to forc...\n", + "25 2 “It’s seriously delightful that our new Dracul...\n", + "26 2 (Photo: Diljit Dosanjh/Instagram)Diljit Dosanj...\n", + "27 3 Some respite came from the 6% growth in utilit...\n", + "28 3 The outlook for India's rupee has deteriorated...\n", + "29 0 However, political analysts said that it will ...\n", + "... ... ...\n", + "2718 0 The party renominated its sitting MP Ranjet Ra...\n", + "2719 2 Was there a moment you were bullied?I was very...\n", + "2720 3 In Delhi, gold of 99.9% and 99.5% purities fel...\n", + "2721 1 Snapchat's controversial and criticised redesi...\n", + "2722 0 Of the eight states that faced single-phase po...\n", + "2723 1 Facebook pulled 513 Pages, Groups and accounts...\n", + "2724 2 One of the most popular female actors in the 1...\n", + "2725 0 Two Telugu TV actresses died in a road acciden...\n", + "2726 1 Google CEO Sundar Pichai introduced Duplex ear...\n", + "2727 2 ”Priyanka was replaced by Katrina Kaif, who wo...\n", + "2728 1 The Chinese smartphone manufacturer today intr...\n", + "2729 1 Samsung, the world’s largest smartphone maker,...\n", + "2730 3 Reliance Securities has revised TCS target pri...\n", + "2731 1 Music streaming may not be novel, but it’s sti...\n", + "2732 3 Ericsson India Pvt. 
Ltd had moved the Supreme ...\n", + "2733 1 On the other hand, the hotel and airline loyal...\n", + "2734 0 The Narendra Modi government simply finished t...\n", + "2735 3 The yield on 10-year Treasuries climbed two ba...\n", + "2736 0 Section 126 of the Representation of People Ac...\n", + "2737 1 With these techniques, machines are also learn...\n", + "2738 1 The traffic challan payment is also restricted...\n", + "2739 1 “We are not like Western countries, where peop...\n", + "2740 3 Further, SBICAP Securities says delivery of th...\n", + "2741 0 Two, Raj Thackeray’s Maharashtra Navnirman Sen...\n", + "2742 0 Senior leaders of the BJP are using the sugges...\n", + "2743 1 According to researchers, fraud in the mobile ...\n", + "2744 1 The iPhone XS and XS Max share the Apple A12 c...\n", + "2745 1 On the photography front, the Note 5 Pro featu...\n", + "2746 0 UDAY mandated that discoms bring the gap betwe...\n", + "2747 1 Ripple also helps bank customers send money to...\n", + "\n", + "[2748 rows x 2 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output['STORY'] = test['STORY']\n", + "output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC\n", + "from sklearn.feature_extraction.text import TfidfTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}\n", + "from sklearn.model_selection import GridSearchCV\n", + "grid = GridSearchCV(SVC(),param_grid,verbose=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "pipeline = Pipeline([\n", + " ('bow', CountVectorizer(analyzer=text_process)), # strings to token integer counts\n", + " ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores\n", + " ('classifier', grid), # train on TF-IDF vectors w/ Naive Bayes classifier\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Subham\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n", + " warnings.warn(CV_WARNING, FutureWarning)\n", + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 25 candidates, totalling 75 fits\n", + "[CV] C=0.1, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.561, total= 30.1s\n", + "[CV] C=0.1, gamma=1, kernel=rbf ......................................\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 30.0s remaining: 0.0s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] .......... 
C=0.1, gamma=1, kernel=rbf, score=0.561, total= 30.8s\n", + "[CV] C=0.1, gamma=1, kernel=rbf ......................................\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 1.0min remaining: 0.0s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.560, total= 29.4s\n", + "[CV] C=0.1, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.391, total= 29.9s\n", + "[CV] C=0.1, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.385, total= 29.3s\n", + "[CV] C=0.1, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.387, total= 29.3s\n", + "[CV] C=0.1, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total= 29.2s\n", + "[CV] C=0.1, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total= 28.8s\n", + "[CV] C=0.1, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total= 29.3s\n", + "[CV] C=0.1, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.363, total= 28.4s\n", + "[CV] C=0.1, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.363, total= 27.5s\n", + "[CV] C=0.1, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.363, total= 27.8s\n", + "[CV] C=0.1, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.363, total= 27.6s\n", + "[CV] C=0.1, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.363, total= 27.5s\n", + "[CV] C=0.1, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.363, total= 28.6s\n", + "[CV] C=1, gamma=1, kernel=rbf ........................................\n", + "[CV] ............ C=1, gamma=1, kernel=rbf, score=0.961, total= 26.9s\n", + "[CV] C=1, gamma=1, kernel=rbf ........................................\n", + "[CV] ............ C=1, gamma=1, kernel=rbf, score=0.956, total= 26.6s\n", + "[CV] C=1, gamma=1, kernel=rbf ........................................\n", + "[CV] ............ C=1, gamma=1, kernel=rbf, score=0.962, total= 26.6s\n", + "[CV] C=1, gamma=0.1, kernel=rbf ......................................\n", + "[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.940, total= 18.9s\n", + "[CV] C=1, gamma=0.1, kernel=rbf ......................................\n", + "[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.937, total= 18.7s\n", + "[CV] C=1, gamma=0.1, kernel=rbf ......................................\n", + "[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.941, total= 19.0s\n", + "[CV] C=1, gamma=0.01, kernel=rbf .....................................\n", + "[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.416, total= 27.8s\n", + "[CV] C=1, gamma=0.01, kernel=rbf .....................................\n", + "[CV] ......... 
C=1, gamma=0.01, kernel=rbf, score=0.417, total= 27.7s\n", + "[CV] C=1, gamma=0.01, kernel=rbf .....................................\n", + "[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.417, total= 27.9s\n", + "[CV] C=1, gamma=0.001, kernel=rbf ....................................\n", + "[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.363, total= 28.2s\n", + "[CV] C=1, gamma=0.001, kernel=rbf ....................................\n", + "[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.363, total= 28.2s\n", + "[CV] C=1, gamma=0.001, kernel=rbf ....................................\n", + "[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.363, total= 28.4s\n", + "[CV] C=1, gamma=0.0001, kernel=rbf ...................................\n", + "[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.363, total= 28.8s\n", + "[CV] C=1, gamma=0.0001, kernel=rbf ...................................\n", + "[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.363, total= 28.8s\n", + "[CV] C=1, gamma=0.0001, kernel=rbf ...................................\n", + "[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.363, total= 31.9s\n", + "[CV] C=10, gamma=1, kernel=rbf .......................................\n", + "[CV] ........... C=10, gamma=1, kernel=rbf, score=0.963, total= 31.7s\n", + "[CV] C=10, gamma=1, kernel=rbf .......................................\n", + "[CV] ........... C=10, gamma=1, kernel=rbf, score=0.958, total= 31.3s\n", + "[CV] C=10, gamma=1, kernel=rbf .......................................\n", + "[CV] ........... C=10, gamma=1, kernel=rbf, score=0.964, total= 28.3s\n", + "[CV] C=10, gamma=0.1, kernel=rbf .....................................\n", + "[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.972, total= 14.4s\n", + "[CV] C=10, gamma=0.1, kernel=rbf .....................................\n", + "[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.965, total= 14.5s\n", + "[CV] C=10, gamma=0.1, kernel=rbf .....................................\n", + "[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.970, total= 14.5s\n", + "[CV] C=10, gamma=0.01, kernel=rbf ....................................\n", + "[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.947, total= 18.0s\n", + "[CV] C=10, gamma=0.01, kernel=rbf ....................................\n", + "[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.942, total= 17.7s\n", + "[CV] C=10, gamma=0.01, kernel=rbf ....................................\n", + "[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.945, total= 18.1s\n", + "[CV] C=10, gamma=0.001, kernel=rbf ...................................\n", + "[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.419, total= 27.9s\n", + "[CV] C=10, gamma=0.001, kernel=rbf ...................................\n", + "[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.421, total= 27.9s\n", + "[CV] C=10, gamma=0.001, kernel=rbf ...................................\n", + "[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.420, total= 29.2s\n", + "[CV] C=10, gamma=0.0001, kernel=rbf ..................................\n", + "[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.363, total= 28.3s\n", + "[CV] C=10, gamma=0.0001, kernel=rbf ..................................\n", + "[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.363, total= 28.5s\n", + "[CV] C=10, gamma=0.0001, kernel=rbf ..................................\n", + "[CV] ...... 
C=10, gamma=0.0001, kernel=rbf, score=0.363, total= 28.2s\n", + "[CV] C=100, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=100, gamma=1, kernel=rbf, score=0.963, total= 27.5s\n", + "[CV] C=100, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=100, gamma=1, kernel=rbf, score=0.958, total= 27.2s\n", + "[CV] C=100, gamma=1, kernel=rbf ......................................\n", + "[CV] .......... C=100, gamma=1, kernel=rbf, score=0.964, total= 27.5s\n", + "[CV] C=100, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.972, total= 14.2s\n", + "[CV] C=100, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.964, total= 14.5s\n", + "[CV] C=100, gamma=0.1, kernel=rbf ....................................\n", + "[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.971, total= 14.6s\n", + "[CV] C=100, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.972, total= 13.6s\n", + "[CV] C=100, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.965, total= 13.7s\n", + "[CV] C=100, gamma=0.01, kernel=rbf ...................................\n", + "[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.970, total= 14.5s\n", + "[CV] C=100, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=100, gamma=0.001, kernel=rbf, score=0.948, total= 18.3s\n", + "[CV] C=100, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=100, gamma=0.001, kernel=rbf, score=0.943, total= 17.7s\n", + "[CV] C=100, gamma=0.001, kernel=rbf ..................................\n", + "[CV] ...... C=100, gamma=0.001, kernel=rbf, score=0.946, total= 18.1s\n", + "[CV] C=100, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.419, total= 29.7s\n", + "[CV] C=100, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.421, total= 28.2s\n", + "[CV] C=100, gamma=0.0001, kernel=rbf .................................\n", + "[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.420, total= 29.2s\n", + "[CV] C=1000, gamma=1, kernel=rbf .....................................\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.963, total= 28.3s\n", + "[CV] C=1000, gamma=1, kernel=rbf .....................................\n", + "[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.958, total= 27.6s\n", + "[CV] C=1000, gamma=1, kernel=rbf .....................................\n", + "[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.964, total= 28.6s\n", + "[CV] C=1000, gamma=0.1, kernel=rbf ...................................\n", + "[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.972, total= 15.1s\n", + "[CV] C=1000, gamma=0.1, kernel=rbf ...................................\n", + "[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.964, total= 15.9s\n", + "[CV] C=1000, gamma=0.1, kernel=rbf ...................................\n", + "[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.971, total= 15.3s\n", + "[CV] C=1000, gamma=0.01, kernel=rbf ..................................\n", + "[CV] ...... 
C=1000, gamma=0.01, kernel=rbf, score=0.973, total= 14.4s\n", + "[CV] C=1000, gamma=0.01, kernel=rbf ..................................\n", + "[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.963, total= 14.1s\n", + "[CV] C=1000, gamma=0.01, kernel=rbf ..................................\n", + "[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.970, total= 14.4s\n", + "[CV] C=1000, gamma=0.001, kernel=rbf .................................\n", + "[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.972, total= 14.1s\n", + "[CV] C=1000, gamma=0.001, kernel=rbf .................................\n", + "[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.965, total= 14.2s\n", + "[CV] C=1000, gamma=0.001, kernel=rbf .................................\n", + "[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.971, total= 14.4s\n", + "[CV] C=1000, gamma=0.0001, kernel=rbf ................................\n", + "[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.948, total= 18.2s\n", + "[CV] C=1000, gamma=0.0001, kernel=rbf ................................\n", + "[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.943, total= 18.5s\n", + "[CV] C=1000, gamma=0.0001, kernel=rbf ................................\n", + "[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.946, total= 18.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 75 out of 75 | elapsed: 29.5min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('bow',\n", + " CountVectorizer(analyzer=,\n", + " binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8',\n", + " input='content', lowercase=True, max_df=1.0,\n", + " max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None,\n", + " stop_words=None, strip_accents=None,\n", + " token_pattern='(?u)\\\\b\\\\w\\\\w...\n", + " decision_function_shape='ovr',\n", + " degree=3, gamma='auto_deprecated',\n", + " kernel='rbf', max_iter=-1,\n", + " probability=False,\n", + " random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False),\n", + " iid='warn', n_jobs=None,\n", + " param_grid={'C': [0.1, 1, 10, 100, 1000],\n", + " 'gamma': [1, 0.1, 0.01, 0.001,\n", + " 0.0001],\n", + " 'kernel': ['rbf']},\n", + " pre_dispatch='2*n_jobs', refit=True,\n", + " return_train_score=False, scoring=None,\n", + " verbose=3))],\n", + " verbose=False)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(train['STORY'],train['SECTION'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
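In the cross-validation log above, small values of C or gamma collapse to roughly the majority-class rate (scores near 0.363), while C between 100 and 1000 with gamma between 0.01 and 0.1 reaches about 0.97; note that the trailing comment on the 'classifier' step still says Naive Bayes even though that step is now the SVC grid search. A sketch for reading the selected parameters back out of the fitted pipeline (this assumes GridSearchCV's default refit=True, which the code above relies on):

# The grid search lives in the step named 'classifier' inside the fitted pipeline.
grid_result = pipeline.named_steps['classifier']
print(grid_result.best_params_)            # expected to fall in the high-C, mid-gamma region seen above
print(round(grid_result.best_score_, 3))   # mean 3-fold CV accuracy of that setting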
" + ], + "text/plain": [ + " SECTION\n", + "0 1\n", + "1 2\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 2\n", + "8 1\n", + "9 2\n", + "10 0\n", + "11 3\n", + "12 2\n", + "13 1\n", + "14 2\n", + "15 1\n", + "16 3\n", + "17 2\n", + "18 3\n", + "19 2\n", + "20 2\n", + "21 2\n", + "22 2\n", + "23 0\n", + "24 0\n", + "25 2\n", + "26 2\n", + "27 3\n", + "28 3\n", + "29 0\n", + "... ...\n", + "2718 0\n", + "2719 2\n", + "2720 3\n", + "2721 1\n", + "2722 0\n", + "2723 1\n", + "2724 2\n", + "2725 2\n", + "2726 1\n", + "2727 2\n", + "2728 1\n", + "2729 1\n", + "2730 3\n", + "2731 2\n", + "2732 3\n", + "2733 1\n", + "2734 0\n", + "2735 3\n", + "2736 0\n", + "2737 1\n", + "2738 1\n", + "2739 1\n", + "2740 3\n", + "2741 0\n", + "2742 0\n", + "2743 1\n", + "2744 1\n", + "2745 1\n", + "2746 0\n", + "2747 1\n", + "\n", + "[2748 rows x 1 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = pipeline.predict(test['STORY'])\n", + "output_svm_quote = pd.DataFrame(predictions,columns=['SECTION'])\n", + "output_svm_quote" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
+       [HTML table omitted: 2748 rows × 1 column of predicted SECTION labels (0-3) for output_svm; identical to the text/plain output below]
" + ], + "text/plain": [ + " SECTION\n", + "0 1\n", + "1 2\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 2\n", + "8 1\n", + "9 2\n", + "10 0\n", + "11 3\n", + "12 2\n", + "13 1\n", + "14 2\n", + "15 1\n", + "16 3\n", + "17 2\n", + "18 3\n", + "19 2\n", + "20 2\n", + "21 2\n", + "22 2\n", + "23 0\n", + "24 0\n", + "25 2\n", + "26 2\n", + "27 3\n", + "28 3\n", + "29 0\n", + "... ...\n", + "2718 0\n", + "2719 2\n", + "2720 3\n", + "2721 1\n", + "2722 0\n", + "2723 1\n", + "2724 2\n", + "2725 2\n", + "2726 1\n", + "2727 2\n", + "2728 1\n", + "2729 1\n", + "2730 3\n", + "2731 1\n", + "2732 3\n", + "2733 1\n", + "2734 0\n", + "2735 3\n", + "2736 0\n", + "2737 1\n", + "2738 1\n", + "2739 1\n", + "2740 3\n", + "2741 0\n", + "2742 0\n", + "2743 1\n", + "2744 1\n", + "2745 1\n", + "2746 0\n", + "2747 1\n", + "\n", + "[2748 rows x 1 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = pipeline.predict(test['STORY'])\n", + "output_svm = pd.DataFrame(predictions,columns=['SECTION'])\n", + "output_svm" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "output_svm_quote.to_excel('output_svm_quote.xlsx',sheet_name='Sheet1',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "output_svm_skim.to_excel('output_svm_skim.xlsx',sheet_name='Sheet1',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sadva', 'vavb', 'is', 'what']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token = ['sadva','vavb','\"',\"'\",'`','is','what']\n", + "[word for word in token if word.lower() not in (stopwords.words('english') and ['\"',\"'\",'`','”','“'])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/News_category_project/Sample_submission.xlsx b/News_category_project/Sample_submission.xlsx new file mode 100644 index 0000000..2b7ecbb Binary files /dev/null and b/News_category_project/Sample_submission.xlsx differ diff --git a/News_category_project/output.xlsx b/News_category_project/output.xlsx new file mode 100644 index 0000000..e604512 Binary files /dev/null and b/News_category_project/output.xlsx differ diff --git a/News_category_project/output_svm.xlsx b/News_category_project/output_svm.xlsx new file mode 100644 index 0000000..dbc1c37 Binary files /dev/null and b/News_category_project/output_svm.xlsx differ diff --git a/News_category_project/output_svm_quote.xlsx b/News_category_project/output_svm_quote.xlsx new file mode 100644 index 0000000..03c6918 Binary files /dev/null and b/News_category_project/output_svm_quote.xlsx differ diff --git a/News_category_project/output_svm_skim.xlsx b/News_category_project/output_svm_skim.xlsx new file mode 100644 index 0000000..39ab27f Binary files /dev/null and b/News_category_project/output_svm_skim.xlsx 
differ
diff --git a/README.md b/README.md
index 7d84014..1689496 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,3 @@
-# Natural-Language-Processing
\ No newline at end of file
+# Natural-Language-Processing
+
+A project that classifies news articles into categories using NLP has been added.
\ No newline at end of file
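
A note on the prediction cells above: the `pipeline` object passed to `pipeline.predict(test['STORY'])` is defined earlier in the notebook and does not appear in this part of the diff. For orientation only, here is a minimal sketch of the kind of scikit-learn pipeline the output file names (`output_svm.xlsx`, `output_svm_quote.xlsx`) hint at (TF-IDF features feeding a linear SVM); the vectorizer settings and the choice of `LinearSVC` are assumptions, not taken from the notebook.

    # Hypothetical sketch only; the notebook's actual pipeline definition is not shown in this part of the diff.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),  # turn each STORY string into a TF-IDF vector
        ('clf', LinearSVC()),                               # linear SVM over the four SECTION labels (0-3)
    ])

    # Typical usage, mirroring the cells above (train/test come from the Excel sheets loaded earlier):
    # pipeline.fit(train['STORY'], train['SECTION'])
    # predictions = pipeline.predict(test['STORY'])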
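
A second note, on the token-filtering cell near the end of the notebook: the expression `stopwords.words('english') and ['"', "'", '`', '”', '“']` evaluates to just the quote list, because `and` between two non-empty lists returns the right-hand operand. That is why `'is'` and `'what'` survive in that cell's output. If the intent was to drop stopwords as well as quote characters, a corrected sketch (assuming the NLTK stopwords corpus has been downloaded) looks like this:

    from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

    # Build one lookup set containing both the NLTK English stopwords and the quote characters.
    drop = set(stopwords.words('english')) | {'"', "'", '`', '”', '“'}

    token = ['sadva', 'vavb', '"', "'", '`', 'is', 'what']
    cleaned = [word for word in token if word.lower() not in drop]
    # cleaned == ['sadva', 'vavb']  ('is' and 'what' are stopwords, the quote marks are punctuation)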