PTEexperiments/index.html at master · JingfengYang/PTEexperiments · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
<!-- Maintainer: Xingchi Li, https://career.lixingchi.com -->
<!DOCTYPE html>
<html lang="en-us">

<head>
    <meta charset="UTF-8">
    <title>Pte: Predictive text embedding through large-scale heterogeneous text networks.</title>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="stylesheet" type="text/css" href="stylesheets/normalize.css" media="screen">
    <link href='https://fonts.googleapis.com/css?family=Open+Sans:400,700' rel='stylesheet' type='text/css'>
    <link rel="stylesheet" type="text/css" href="stylesheets/stylesheet.css" media="screen">
    <link rel="stylesheet" type="text/css" href="stylesheets/github-light.css" media="screen">
    <script src="https://cdn.jsdelivr.net/gh/google/code-prettify@master/loader/run_prettify.js"></script>
    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.0/jquery.js"></script>
    <script type="text/javascript" src="https://d3js.org/d3.v5.min.js"></script>
    <script type="text/javascript" src="https://d3js.org/d3-dsv.v1.min.js"></script>
    <script type="text/javascript" src="https://d3js.org/d3-fetch.v1.min.js"></script>
    <script type="text/javascript" src="https://d3js.org/d3-color.v1.min.js"></script>
    <script type="text/javascript" src="https://d3js.org/d3-interpolate.v1.min.js"></script>
    <script type="text/javascript" src="https://d3js.org/d3-scale-chromatic.v1.min.js"></script>
    <script type="text/javascript" src="https://unpkg.com/topojson@3"></script>
    <script type="text/javascript" src="javascripts/d3-tip.min.js"></script>
    <script type="text/x-mathjax-config">
        MathJax.Hub.Config({ tex2jax: { skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'], inlineMath: [['$','$']] } });
    </script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
</head>

<body>
    <section class="page-header">
        <h1 id="pdafjios" class="project-name">Pte: Predictive text embedding through large-scale heterogeneous text networks.</h1>
        <h2 class="project-tagline">CS7641 Machine Learning Project</h2>
        <h2 class="project-tagline">Bing He, Huili Huang, Xingchi Li, Zuoxin Tang, Jingfeng Yang</h2>
        <a href="https://github.com/JingfengYang/PTEexperiments" class="btn">View on GitHub</a>
        <a href="proposal.pdf" class="btn">View Proposal</a>
    </section>

    <section class="main-content">
        <h2>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Introduction
        </h2>
        <h3>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Motivation
        </h3>
        <p>
            Text contains a great deal of useful and meaningful information for people. Learning the meaning of words is a critical prerequisite for many machine learning tasks. Traditionally, people represent words independently of each other and represent each
            document as a "bag of words". However, the sparsity, polysemy, and synonymy of words and documents are commonly ignored during training.
        </p>
        <p>
            One way to deal with this is to represent words and documents in low-dimensional spaces[1]. Through an unsupervised learning process, similar words and documents are embedded close to each other. For example, Mikolov et al. proposed Skip-gram, which uses
            the embedding of the target word to predict the embedding of each individual context word in a local window[1]. Compared with other classical approaches that utilize the distributional similarity of word contexts, this text embedding approach
            is more efficient and easier to deploy[2].
        </p>
        <p>
            However, compared with deep learning approaches such as convolutional neural networks (CNNs), the precision of text embedding methods tends to be lower. This is because methods like CNNs leverage labeled information and can thus learn the representation
            of the data better. In other words, unsupervised methods generalize across different tasks but have weaker predictive power for a particular task[2]. However, compared with text embedding, CNN training also has a drawback: it
            requires numerous parameters, which makes training both time-consuming and difficult.
        </p>
        <p>
            To sum up, text embedding methods are more efficient while deep neural networks are more predictive. One can therefore consider taking advantage of both supervised and unsupervised information at the same time.
        </p>
        <h3>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Project scope
        </h3>
        <p>
            In this project, we deploy Predictive Text Embedding (PTE), a semi-supervised representation learning method for text data, to fill the gap between supervised and unsupervised learning. By learning from limited labeled examples and a large number of unlabeled
            examples, we obtain an effective low-dimensional representation that captures the information between words and words, words and labels, and words and documents.
        </p>
        <p>
            We build two kinds of training data: one has a network that encodes different levels of co-occurrence information between words and words, while the other does not. Then, we conduct experiments with the sentiment data set MR, the short document data set DBLP and the long
            document data set 20NG. Experimental results show that the data with the word-word information outperforms the data without this embedding information in various text classification tasks.
        </p>
        <h2>
            <a id="designer-templates" class="anchor" href="#designer-templates" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Methods
        </h2>
        <p>
            In semi-supervised sentence classification, to leverage both annotated data and unannotated text data, there are three steps in our method. We first build a heterogeneous graph whose edges consist of word-sentence, word-word, and word-label edges. Then we learn
            the embeddings of each vertex in the graph. Finally, we use them as the input of the text classification task. This framework was proposed by <a href="https://arxiv.org/abs/1508.00200">Tang et al.</a>.
        </p>
        <p>
            The advantages of using this framework are two-fold. The first advantage is that this heterogeneous graph has both labels and words as its vertices. Thus, it can leverage the supervision information of the labels when learning word embeddings, which is
            more promising than self-supervised ways of learning word embeddings like [1]. Another advantage is that we can incorporate as much unlabeled data as we like in the graph, and the supervision information can flow to unlabeled words through label-word
            edges and word-word edges.
        </p>
        <h4>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Building Heterogeneous Bipartite Graph
        </h4>
        <p>
            In the first step, we build word-word network, word-document network and word-label network. Word-word co-occurrence network, denoted as $G_{ww}=(\mathcal{V},E_{ww})$, captures the word co-occurrence information in local contexts of the unlabeled data.
            $\mathcal{V}$ is a vocabulary of words and $E_{ww}$ is the set of edges between words. The weight $w_{ij}$ of the edge between word $v_i$ and $v_j$ is defined as the number of times that the two words co-occur in the context windows of a given
            window size. Word-document network, denoted as $G_{wd}=(\mathcal{V} \cup \mathcal{D},E_{wd})$, is a bipartite network where $\mathcal{D}$ is a set of documents and $\mathcal{V}$ is a set of words. $E_{wd}$ is the set of edges between words
            and documents. The weight $w_{ij}$ between word $v_i$ and document $d_j$ is simply defined as the number of times $v_i$ appears in document $d_j$. Word-label network, denoted as $G_{wl}=(\mathcal{V} \cup \mathcal{L},E_{wl})$, is a bipartite
            network that captures category-level word co-occurrences. $\mathcal{L}$ is a set of class labels and $\mathcal{V}$ a set of words. $E_{wl}$ is a set of edges between words and classes. The weight $w_{ij}$ of the edge between word $v_i$ and
            class $c_j$ is defined as: $w_{ij}=\sum_{(d:l_d=j)} n_{di}$, where $n_{di}$ is the term frequency of word $v_i$ in document $d$, and $l_d$ is the class label of document $d$. Here is an example of these three kinds of bipartite network:

        </p>
        <img src="example.png" alt="samples" />

        <h4>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Learning Node Embeddings
        </h4>
        <p>
            Before introducing the second step of learning node embeddings, we first generally introduce the way of learning embeddings in bipartite network. Given a bipartite network $G=(\mathcal{V}_A \cup \mathcal{V}_B, E)$, where $\mathcal{V}_A$ and $\mathcal{V}_B$
            are two disjoint sets of vertices of different types, and $E$ is the set of edges between them. We first define the conditional probability of vertex $v_i$ in set $\mathcal{V}_A$ generated by vertex $v_j$ in set $\mathcal{V}_B$ as: $$p(v_i|v_j)=\frac{\exp(\vec{u}_i^T\cdot
            \vec{u}_j)}{\sum_{i'\in A} \exp({\vec{u}_{i'}}^T \cdot \vec{u}_j)}$$ where $\vec{u}_i$ is the embedding vector of vertex $v_i$ in $\mathcal{V}_A$, and $\vec{u}_j$ is the embedding vector of vertex $v_j$ in $\mathcal{V}_B$. To preserve the
            second-order proximity, we can make the conditional distribution $p(\cdot|v_j)$ be close to its empirical distribution $\hat{p}(\cdot|v_j)$, which can be achieved by minimizing the following objective function: $$O=\sum_{j\in B} \lambda_j
            d(\hat{p}(\cdot|v_j), p(\cdot|v_j))$$ where $d(\cdot,\cdot)$ is the KL-divergence between two distributions, $\lambda_j$ is the importance of vertex $v_j$ in the network, which can be set as the degree $deg_j=\sum_i w_{ij}$, and the empirical
            distribution can be defined as $\hat{p}(v_i|v_j)=\frac{w_{ij}}{deg_j}$. Omitting some constants, the objective function can be calculated as: $$O=-\sum_{(i,j)\in E} w_{ij}\log p(v_i|v_j)$$ To learn the embeddings of our heterogeneous text
            network, we minimize the following objective function: $$O_{pte}= O_{ww}+O_{wd}+O_{wl}$$ where $$O_{ww}=-\sum_{(i,j)\in E_{ww}} w_{ij} \log p(v_i|v_j)$$ $$O_{wd}=-\sum_{(i,j)\in E_{wd}} w_{ij}\log p(v_i|d_j)$$ $$O_{wl}=-\sum_{(i,j)\in E_{wl}}
            w_{ij}\log p(v_i|l_j)$$ We use edge sampling and negative sampling to jointly train the three objectives by stochastic gradient descent. Here is the procedure:

        </p>

        <!--         <pre class="prettyprint">
Data: {G_ww, G_wd, G_wl}, number of samples T, number of negative samples K.
Result: word embedding w.
while iter <= T do
    sample an edge from E_ww and draw K negative edges, and update the word embeddings;
    sample an edge from E_wd and draw K negative edges, and update the word and document embeddings;
    sample an edge from E_wl and draw K negative edges, and update the word and label embeddings;
end</pre> -->

        <img src="alg.png" alt="algorithm" />

        <h4>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Classification Methods
        </h4>


        <p>
            In the third step, we use three kinds of classification algorithms to test the performance of our embeddings. For each algorithm, we use the embeddings trained with unlabeled sentences and without unlabeled sentences. We expect that the embeddings trained
            with unlabeled sentences perform better, which would show the effectiveness of PTE as a semi-supervised framework. We first use SVM and logistic regression as the classification algorithms. The input is the document embedding, where the embedding of a piece of text
            $d=w_1w_2\cdots w_n$ is computed as $$\vec{d}=\frac{1}{n}\sum_{i=1}^n \vec{u}_i,$$ where $\vec{u}_i$ is the embedding of word $w_i$. In fact, the average of the word embeddings is the solution to minimizing the following objective function:
            $$O=\sum_{i=1}^{n} l(\vec{u}_i, \vec{d})$$ where the loss function $l(\cdot,\cdot)$ between the word embedding $\vec{u}_i$ and text embedding $\vec{d}$ is specified as the Euclidean distance. Then we use CNN as the classifier, where the inputs
            are word embeddings.

        </p>

        <h4>
            <a id="welcome-to-github-pages" class="anchor" href="#welcome-to-github-pages" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Text Classification Datasets
        </h4>
        <p>
            We use three text classification datasets to report the performance. One of them consists of long documents, i.e. paragraph classification. Two of them consist of short documents, i.e. sentence classification. 20NG is a widely used long document classification data set,
            <a href="http://qwone.com/~jason/20Newsgroups/">20newsgroup</a>, containing 20 categories. DBLP contains titles of papers from <a href="http://arnetminer.org/billboard/citation">the computer science bibliography</a>. There
            are six diverse research fields for classification including "database", "artificial intelligence", "hardware", "system", "programming languages" and "theory". For each field, representative conferences are selected and the papers published
            in the selected conferences are collected as the labeled documents. This dataset is a short document classification task.

            <a href="http://www.cs.cornell.edu/people/pabo/movie-review-data/">MR</a> is a movie review data set, in which each review only contains one sentence and one of the two sentiment polarity labels, i.e. positive or negative. To test the performance
            in semi-supervised settings, in each data set, we not only use the whole labeled training set, but also use 12.5%, 25% and 50% labeled data respectively. The remaining training data is considered as unlabeled data.
        </p>
        <h2>
            <a id="creating-pages-manually" class="anchor" href="#creating-pages-manually" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Results
        </h2>
        <h3>
            <a class="anchor" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Results for DBLP dataset
        </h3>
        <p id="word-label">
            We will be able to determine the approach with the best performance using the three datasets and the contribution of labeled and unlabeled data in semi-supervised learning to the final result, as well as the efficiency and the parameters used. Finally,
            we also present document visualizations to vividly illustrate our result. As seen in the following graph, we have separated the nodes into two classes: text and label. There are six labels in the embedding, connected to the texts by
            yellowish lines. Among the $1.25$ million texts, we picked around $100$ texts to show what we have done; the links between them are colored steelblue.
        </p>
        <h3>
            <a id="creating-pages-manually" class="anchor" href="#creating-pages-manually" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Results for MR dataset
        </h3>
        <p id="word-label-mr">
            There are $600$ thousand words and only two labels in the MR dataset. We pick $100$ of the words to show the relationships among them clearly. Similar to the DBLP dataset, we use yellow lines to draw the word-label relationships and steelblue lines to
            draw the word-word relationships.
        </p>
        <h2 id="graph-convolutional-network-trial">Graph Convolutional Network Trial</h2>
        <h3 id="gcn-illustration">Graph Convolutional Network Illustration</h3>
        <img src="gcnTransform.png" alt="samples" />
        <h3 id="procedure">Procedure</h3>
        <ul>
            <li>
                Remove stop word: we remove the stop words and low-frequency words (conducted in the PTE preprocessing).
            </li>
            <li>
                Build graph: we build the graph from the network construction results of the PTE model. In the PTE graph construction process, we have three graphs: (1) word-word network (2) word-document network (3) word-label network; we can use the first two graphs
                to build a new graph where each node can be a word/document and an edge indicates co-occurrence (as shown in the above figure, the first two graphs are converted into one graph, and the third, label graph is just used to set the node labels). To normalize it, we use TF-IDF instead. (For computing convenience, we directly use the text data.)
            </li>
            <li>
                Learn graph convolutional network: once a graph is built, we use the classical GCN learning method, where we only have two layers.
            </li>
        </ul>
        <h3 id="evaluation">Evaluation</h3>
        <p>
            We compare PTE and GCN on the MR dataset. As we can see, GCN has a relatively higher F1 score. The reason is that GCN usually learns a better graph-structure representation than the embedding vector by PTE. Hence, GCN can achieve a better prediction result.
            <table>
                <tr>
                    <th></th>
                    <th style="text-align:center">PTE</th>
                    <th style="text-align:center">GCN</th>
                </tr>
                <tr>
                    <td>F1-micro/F1-macro</td>
                    <td style="text-align:center">0.74</td>
                    <td style="text-align:center">0.76</td>
                </tr>
            </table>
        </p>
        <h2>
            <a id="creating-pages-manually" class="anchor" href="#creating-pages-manually" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Line chart comparison of Logistic Regression, CNN and Support Vector Machine
        </h2>
        <p id="mr-line">
            <ol>
                <li>Different classification results:<br/> As we can see in the result, SVM mostly beats LR and CNN in terms of F1 micro/macro scores on this text classification task. The reason may be that SVM can better capture the embedding space
                    features and thus make better predictions.
                </li>
                <li>Different label ratio results:<br/> The result shows that with the percentage of the labeled data increasing (i.e., 0.125, 0.25, 0.5, 1), the F1 score increases accordingly. It is easy to interpret this finding since with more labeled
                    data, we can learn a more precise model from the objective function. More labeled data indicates a bigger feature space that can be learned.
                </li>
                <li>Different supervised learning settings: semi-supervised advantage:<br/> In the experiment, we also test the built graph under two kinds of datasets: (1) graph with labeled data only; (2) graph with labeled and unlabeled data together. As is shown
                    in the result, the added unlabeled data can improve the F1 score. This is because when the unlabeled data is added in the graph construction process, a better word embedding is generated, leading to a higher F1 score for labeled+unlabeled
                    data than for labeled data alone.
                </li>
            </ol>
            <table style="width:100%">
                <tr>
                    <th>F1-MACRO</th>
                    <th>F1-MICRO</th>
                </tr>
                <tr>
                    <td id="mult-line-a-mr-1"></td>
                    <td id="mult-line-i-mr-1"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-mr-2"></td>
                    <td id="mult-line-i-mr-2"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-mr-3"></td>
                    <td id="mult-line-i-mr-3"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-dblp-1"></td>
                    <td id="mult-line-i-dblp-1"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-dblp-2"></td>
                    <td id="mult-line-i-dblp-2"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-dblp-3"></td>
                    <td id="mult-line-i-dblp-3"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-20ng-1"></td>
                    <td id="mult-line-i-20ng-1"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-20ng-2"></td>
                    <td id="mult-line-i-20ng-2"></td>
                </tr>
                <tr>
                    <td id="mult-line-a-20ng-3"></td>
                    <td id="mult-line-i-20ng-3"></td>
                </tr>
            </table>
		(X label: the percentage of labeled data, Y label: F1-Micro/F1-Macro)
        </p>
        <h2>
            <a id="creating-pages-manually" class="anchor" href="#creating-pages-manually" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Scatter Plot of MR
        </h2>
        <p id="scatter-mr">
            There are two labels in the MR dataset. We used <code class="prettyprint">sklearn.manifold.TSNE</code> by extracting the two main components from the dataset containing $0.6$ million texts and labels, thus better illustrating the idea of the
            embedding.
        </p>
        <h2>
            <a id="creating-pages-manually" class="anchor" href="#creating-pages-manually" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Scatter Plot of DBLP
        </h2>
        <p id="scatter-dblp">
            There are six labels in the DBLP dataset. We used <code class="prettyprint">sklearn.manifold.TSNE</code> by extracting the two main components from the dataset containing $1.25$ million texts and labels. To better illustrate the idea of the
            embedding, we added the functionality to choose among the labels to show. The label input accepts number inputs including number $1$ to $6$ and the graph is generated based on the input labels which by default is $123456$, i.e. including all
            the labels. (Please have patience as the graph is generated real-time, thus depending on your network and processor.)<br/>
            <label for="labelValue" style="display: inline-block; width: 240px; text-align: right">
                 label = <span id="labelValue-value"></span>
            </label>
            <input type="text" value="123456" id="labelValue">
        </p>
        <h2>
            <a id="authors-and-contributors" class="anchor" href="#authors-and-contributors" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Discussion
        </h2>
        <h3>
            <a class="anchor" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Conclusions
        </h3>
        <ul>
            <li>
                PTE relates the unlabeled training data directly to the specific task, making it outperform methods with pure supervised learning and those training word/semantic embedding separately. Our experiments also show that the more unlabeled data
                we add, the more improvement we will obtain over pure supervised learning. In conclusion, PTE is well suited for task-specific feature extraction.
            </li>
            <li>
                Since PTE is a graph-based algorithm, utilizing GCN can further enhance the performance. Our tests in GCN with methods proposed in <a href="https://arxiv.org/abs/1609.02907">Semi-Supervised Classification with Graph Convolutional Networks</a>
		show that GCN is better, probably because it takes advantage of both graph structure proposed in PTE and the representation power of neural network.
            </li>
        </ul>
        <h3>
            <a class="anchor" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Future work
        </h3>
        <ul>
            <li>
                In the MR dataset, a large portion of the embeddings blend together even though there are only two classes in MR. We suspect that this is because some key words like "but" and "nevertheless" can change the sentiment of the text abruptly. Because of
                this nature of the sentiment analysis task, we expect that an attention mechanism will further improve our performance.</li>
                <li>
                    Since we have different kinds of nodes in the graph (word, document, label), we can modify current methods based on the heterogeneity of our graph. </li>
                <li>
                    GCN is a more generic and powerful framework for graph-based algorithms, as has been shown in our experiments. It is promising to deploy more advanced, recently proposed GCN training algorithms. </li>
        </ul>
        <h2>
            <a id="support-or-contact" class="anchor" href="#support-or-contact" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Contribution
        </h2>

        <ul>
                <li>Proposal - Jingfeng Yang, Huili Huang, Xingchi Li, Bing He, Zuoxin Tang </li>
                <li>Website Designing - Xingchi Li</li>
                 <li> Report and website content - Jingfeng Yang, Huili Huang, Xingchi Li, Bing He, Zuoxin Tang</li>
                <li>Visualization of Data - Xingchi Li</li>
                <li>Preprocessing of two short document datasets - Jingfeng Yang </li>
                <li>Preprocessing of long document datasets - Huili Huang </li>
                <li>Using PTE to generate Text Embeddings - Jingfeng Yang</li>
                <li>Logistic Regression Experiment - Huili Huang</li>
                <li>SVM Experiment - Bing He</li>
                <li>Graph Convolutional Network Experiment - Bing He</li>
                <li>CNN Experiment - Zuoxin Tang</li>
        </ul>

        <h2>
            <a id="support-or-contact" class="anchor" href="#support-or-contact" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>References
        </h2>
        <p>
            <ol>
                <li>
                    T. Mikolov, I. Sutskever, K. Chen, G. S. Corrado, and J. Dean. Distributed representations of words and phrases and their compositionality. In NIPS, pages 3111–3119, 2013.
                </li>
                <li>
                    Jian Tang, Meng Qu, and Qiaozhu Mei. Pte: Predictive text embedding through large-scale heterogeneous text networks. In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD ’15, pages 1165–1174, New York,
                    NY, USA, 2015. ACM.
                </li>
                <li>
                    Thomas N Kipf and Max Welling. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907, 2016.
                </li>
                <li>
                    Liang Yao, Chengsheng Mao, and Yuan Luo. Graph convolutional networks for text classification. In Proceedings of the AAAI Conference on Artificial Intelligence, volume 33, pages 7370–7377, 2019.
                </li>

            </ol>
        </p>

        <footer class="site-footer">
            <span class="site-footer-owner"><a href="https://github.com/nathanxli/PTEexperiments">Pte: Predictive text embedding through large-scale heterogeneous text networks.</a> is maintained by <a href="https://github.com/binghesam">Bing He</a>, <a href="https://github.com/HUILIHUANG413">Huili Huang</a>, <a href="https://career.lixingchi.com">Xingchi Li</a>, <a href="https://github.com/HyperCubic">Zuoxin Tang</a>, <a href="https://github.com/JingfengYang">Jingfeng Yang</a>.</span>

            <span class="site-footer-credits">This page was generated by <a href="https://pages.github.com">GitHub Pages</a> using the <a href="https://github.com/jasonlong/cayman-theme">Cayman theme</a> by <a href="https://twitter.com/jasonlong">Jason Long</a>.</span>
        </footer>

    </section>

</body>

<script type="text/javascript" src="javascripts/word-label.js"></script>
<script type="text/javascript" src="javascripts/word-label-mr.js"></script>
<!-- <script type="text/javascript" src="javascripts/mr-line.js"></script> -->
<script type="text/javascript" src="javascripts/mult-line.js"></script>
<script type="text/javascript" src="javascripts/scatter-mr.js"></script>
<script type="text/javascript" src="javascripts/scatter-dblp.js"></script>

<!-- <script type="text/javascript">
		var width = 960,
	height = 500;
var color = d3.scaleOrdinal(d3.schemeCategory10);
// create force layout
var force = d3.forceSimulation().charge(-120).linkDistance(30).size([width, height]).distance(200);
var svg = d3.select("#bipartite").append("svg").attr("width", width).attr("height", height).append('svg:g').call(d3.behavior.zoom().on("zoom", redraw));
function redraw() {
	svg.attr("transform", "translate(" + d3.event.translate + ")" + " scale(" + d3.event.scale + ")");
	node.attr("transform", function(d) {
		return "translate(" + d.x + "," + d.y + ")";
	});
}
d3.json("bip.json", function(error, graph) {
	console.log("graph.nodes", graph.nodes)
	console.log("graph.links", graph.links)
	// start force layout
	force.nodes(graph.nodes).links(graph.links).start();
	var link = svg.selectAll(".link").data(graph.links).enter().append("line").attr("class", "link")
	.style("stroke-width", function(d) {
		return Math.sqrt(d.value);
	});
	var node=svg.selectAll('g.node').data(graph.nodes).enter().append('svg:g').attr('class','node').call(force.drag)
	var circles=node.append('circle').attr("r", 5).style("fill", function(d) {
		return color(d.group);
	});
	var texts=node.append('svg:text')
	.attr("dx", 12)
      .attr("dy", ".35em")
	.text(function(d){return d.name});
	node.append("title").text(function(d) {
		return d.name;
	});
	force.on("tick", function() {
		link.attr("x1", function(d) {
			return d.source.x;
		}).attr("y1", function(d) {
			return d.source.y;
		}).attr("x2", function(d) {
			return d.target.x;
		}).attr("y2", function(d) {
			return d.target.y;
		});
		node.attr("transform", function(d) {
			var x=d.x,
				y=d.y;
			if (x>300 && d.group=='tuple'){
				x=300
				d.x=x
			}
			if (x<500 && d.group=='pattern'){
				x=500
				d.x=x
			}
			return "translate(" + x + "," + y + ")";
		});
	});
});
	</script> -->

</html>