This repository has been archived by the owner on Jul 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
cluster.py
88 lines (70 loc) · 2.74 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3.4
#
# @file classify.py
# @brief Use a CDE to classify GitHub projects
# @author Matthew Graham
#
# <!---------------------------------------------------------------------------
# Copyright (C) 2017 by the California Institute of Technology.
# This software is part of CASICS, the Comprehensive and Automated Software
# Inventory Creation System. For more information, visit http://casics.org.
# ------------------------------------------------------------------------- -->
import dag
import CDE
from process-readme import preprocess, tfidf_vectorizer, svd_vectorizer
import sys
import os
sys.path.append('/Users/mjg/Projects/casics/casics/src')
import casics
if __name__ == '__main__' and __package__ is None:
sys.path.append(os.path.join(os.path.dirname(__file__), "../database"))
from casicsdb import *
# Main body.
# .............................................................................
def main():
# Get label hierarchy
concepts = DAG("lcsh.graphml")
# Get training set
ids, X, y = get_training_set()
# Initialize CDE
cde = CDE(concepts)
# Train CDE
cde.fit(X, y)
# Helpers
# .............................................................................
def get_training_set():
'''Load the training set of annotated repos from the db'''
# Connect to db
casicsdb = CasicsDB(server = 'synonym.caltech.edu', port = '9988', username = 'mjg', password = 'casicsfun')
github_db = casicsdb.open('github')
# Get repos
allrepos = github_db.repos
# Get repos with LCSH annotations
repos = allrepos.find({'topics.lcsh': {'$ne': []}})
# Parse text per repo to create feature vectors
ids, docs, labels = get_text(repos)
docs_preprocess = map(lambda doc: preprocess(doc), docs) # stemming string
tfidf_matrix = tfidf_vectorizer(docs_preprocess) # convert to td-idf matrix
svd_vect = svd_vectorizer(tfidf_matrix, n_components = 200, n_iter = 150) # reduce dimeensions
return ids, svd_vect, labels
def get_text(repos):
''' Construct vectors of text per repo '''
text = []
labels = []
ids = []
for repo in repos:
descrip = ''
readme = ''
ids.append(repo['id_'])
if repo['description']: descrip = repo['description']
if repo['readme'] and repo['readme'] != -1:
readme = repo['readme'] # bytes object
if len(descrip) + len(readme) > 0:
text.append(descrip + " " + str(readme))
labels.append(repo['topics']['lcsh'])
return ids, text, labels
# Quick test interface.
# .............................................................................
if __name__ == '__main__':
import pprint
msg(pprint.pprint(file_elements(sys.argv[1])))