-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfiles_read.py
68 lines (56 loc) · 1.63 KB
/
files_read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import string
import numpy
def add_read(path):
    """Return the paths of the ``.txt`` documents located directly in *path*.

    Only regular files are considered (sub-directories are skipped, and the
    directory is not searched recursively).  Files whose name ends with
    ``justTitle.txt`` are excluded.

    The result is sorted by file name: ``os.listdir`` returns entries in
    arbitrary order, and callers use the position in this list as the
    document index, so a stable order keeps those indices reproducible.

    :param path: directory to scan.
    :returns: list of ``os.path.join(path, name)`` strings, sorted by name.
    :raises OSError: if *path* cannot be listed.
    """
    adds = []
    for name in sorted(os.listdir(path)):
        if not name.endswith('.txt') or name.endswith('justTitle.txt'):
            continue
        full_path = os.path.join(path, name)
        if os.path.isfile(full_path):
            adds.append(full_path)
    return adds
if __name__ == "__main__":
    # Manual smoke check against the author's local copy of the corpus.
    # Guarded so that merely importing this module does not list (or fail
    # on) a directory that only exists on one machine.
    add_read("C:\\Users\\ANUP\\Documents\\machine_learning\\datasets\\500N-KPCrowd-v1.1\\CorpusAndCrowdsourcingAnnotations\\train")
def read_file(path):
    """Return the entire contents of the text file at *path*.

    The file is opened in text mode with the platform default encoding.

    :param path: path of the file to read.
    :returns: the file contents as a single string.
    :raises IOError/OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, "r") as f:
        return f.read()
def read_all(adds):
    """Read every file listed in *adds* and collect the contents.

    :param adds: iterable of file paths.
    :returns: list with one string per path, in the same order as *adds*.
    """
    return [read_file(file_path) for file_path in adds]
def remove_punct(all_text):
    """Return a copy of each text with ASCII punctuation removed.

    Every character listed in ``string.punctuation`` is dropped; all other
    characters (including whitespace) are kept unchanged.

    :param all_text: iterable of strings.
    :returns: new list of strings, same length and order as *all_text*.
    """
    # Filtering by membership works identically on Python 2 and 3, unlike
    # the two-argument str.translate / string.maketrans API (Python 2 only).
    punct = frozenset(string.punctuation)
    return ["".join(ch for ch in text if ch not in punct)
            for text in all_text]
def id_words(all_text):
    """Build, for every text, a mapping from each distinct word to an id.

    Words are obtained with ``str.split()`` (whitespace separated) and
    receive ids in sorted order.  The id counter is NOT reset between
    texts: it keeps increasing across the whole input, so the same word
    occurring in two texts gets a different id in each text's mapping.

    :param all_text: iterable of strings.
    :returns: list of dicts ``{word: id}``, one per input text, in order.
    """
    ar = []
    iden = 0
    for mail in all_text:
        dic = {}
        # sorted(set(...)) yields each distinct word once, in the same
        # order in which a sorted word list would first encounter it.
        for word in sorted(set(mail.split())):
            dic[word] = iden
            iden += 1
        ar.append(dic)
    return ar
# Example usage:
#   text = ["text is data type data", "spam offer spam", "meeting at noon at work"]
#   ar = id_words(text)
def feat_arr(function, mails, arrK):
    """Flatten per-mail features into ``[mail index, word id, value]`` rows.

    For each mail, ``function(mail)`` is called and every key of the mapping
    it returns is looked up in that mail's entry of *arrK* to obtain its id.

    :param function: callable taking one mail and returning a mapping
        (presumably word -> feature value, e.g. a count; confirm with the
        callers).
    :param mails: list of mails (strings).
    :param arrK: list with one dict per mail mapping each key to its id,
        indexed in the same order as *mails*.
    :returns: list of ``[mail_index, id, value]`` lists.
    :raises KeyError: if *function* returns a key missing from the mail's
        dict in *arrK*.
    :raises IndexError: if *arrK* has fewer entries than *mails*.
    """
    listO = []
    for mail_no, mail in enumerate(mails):
        features = function(mail)
        word_ids = arrK[mail_no]
        for key, value in features.items():
            listO.append([mail_no, word_ids[key], value])
    return listO