-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyao_file_size.py
154 lines (131 loc) · 5.93 KB
/
yao_file_size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import yaoner
import os
import sys
import utility
__author__ = 'Frank'
def generate_by_type(ratio_index, file_type_index, output_filename):
    """Print the min / max / mean metadata-to-file ratio of each file type.

    Both inputs are text files read line by line and split on whitespace:
    ``file_type_index`` holds ``<filename> <filetype>`` and ``ratio_index``
    holds ``<filename> <ratio>``.  Ratios are grouped by the type of their
    file and four parallel lists are printed to stdout: the types, then the
    minimum, the maximum and the mean ratio of each type.

    Types with fewer than two ratios are left out of the report.  Files that
    appear in ``ratio_index`` but not in ``file_type_index`` are printed as
    ``<running count> <filename>``.

    Lines with fewer than two fields (for example blank lines) are skipped in
    both inputs, and so is a ``filename<TAB>ratio`` header row in
    ``ratio_index``.

    :param ratio_index: path of the ``<filename> <ratio>`` file.
    :param file_type_index: path of the ``<filename> <filetype>`` file.
    :param output_filename: currently unused; kept so existing callers keep
        working.
    :raises ValueError: if a ratio field is not a number.
    """
    file_type_map = dict()
    result_map = dict()
    with open(ratio_index) as ratio_index_file, open(file_type_index) as file_type_index_file:
        for line in file_type_index_file:
            words = line.split()
            if len(words) < 2:
                continue
            file_type_map[words[0]] = words[1]

        count = 0  # number of ratio entries whose file has no known type
        for line in ratio_index_file:
            words = line.split()
            if len(words) < 2:
                continue
            if words[:2] == ['filename', 'ratio']:
                # Header row as written by generate_ratio_of_metadata_to_file;
                # it is not a data entry.
                continue
            filename = words[0]
            if filename in file_type_map:
                # Convert once here so a malformed ratio fails at its source.
                result_map.setdefault(file_type_map[filename], []).append(float(words[1]))
            else:
                print(count, filename)
                count += 1

    type_list = list()
    min_list = list()
    max_list = list()
    average_list = list()
    for filetype, ratios in result_map.items():
        # A single sample gives no meaningful spread, so such types are skipped.
        if len(ratios) < 2:
            continue
        type_list.append(filetype)
        min_list.append(min(ratios))
        max_list.append(max(ratios))
        average_list.append(sum(ratios) / len(ratios))

    print(type_list)
    print(min_list)
    print(max_list)
    print(average_list)
    return
def generate_ratio_of_metadata_to_file(metadata_index, file_index, output_filename):
    """Write the metadata-size / file-size ratio of every file as a TSV.

    Both inputs are text files read line by line and split on whitespace,
    each line holding ``<filename> <size>``.  For every entry of
    ``metadata_index`` whose filename also appears in ``file_index`` one
    ``<filename><TAB><ratio>`` row is written, after a ``filename<TAB>ratio``
    header row.

    Lines with fewer than two fields (for example blank lines) are skipped.
    Files whose recorded size is zero are skipped as well, because their
    ratio is undefined.

    :param metadata_index: path of the ``<filename> <metadata size>`` file.
    :param file_index: path of the ``<filename> <file size>`` file.
    :param output_filename: path of the TSV report to write.
    :raises ValueError: if a size field is not a number.
    """
    file_dictionary = dict()
    content = ['filename\tratio']
    # Read both inputs before touching the output so that a missing or
    # malformed input does not leave a truncated report behind.
    with open(metadata_index) as metadata_index_file, open(file_index) as file_index_file:
        for line in file_index_file:
            words = line.split()
            if len(words) < 2:
                continue
            file_dictionary[words[0]] = float(words[1])

        for line in metadata_index_file:
            words = line.split()
            if len(words) < 2:
                continue
            filename = words[0]
            metadata_json_size = float(words[1])
            file_size = file_dictionary.get(filename)
            if not file_size:
                # Unknown file, or a zero-byte file (division by zero).
                continue
            content.append('\t'.join([filename, str(metadata_json_size / file_size)]))

    # NOTE(review): presumably creates the parent directory of the output
    # file when it is missing -- confirm against the utility module.
    utility.create_parent_folder_if_needed_for_output_file(output_filename)
    with open(output_filename, 'w') as output_file:
        for entry in content:
            output_file.write(entry)
            output_file.write('\n')
    return
def combine_shell_ls_output(filename_list, output_name):
    """Merge several saved directory listings into one ``<id> <size>`` file.

    Every input file is scanned line by line.  Only lines that split into
    exactly nine whitespace-separated fields are used: the fifth field is
    taken as the size and the ninth as the file name, of which everything
    from the first ``.`` onwards is dropped.  The resulting ``<id> <size>``
    records are written to ``output_name`` in the order they were read.

    NOTE(review): the nine-field shape looks like ``ls -l`` output of the
    metadata JSON files -- confirm against the files actually passed in.

    :param filename_list: paths of the listing files to merge.
    :param output_name: path of the combined file to write.
    """
    records = []
    with open(output_name, 'w') as sink:
        # All listings are parsed before anything is written.
        for listing_path in filename_list:
            with open(listing_path) as listing:
                for row in listing:
                    fields = row.split()
                    if len(fields) != 9:
                        continue
                    hash_id = fields[8].split('.')[0]
                    records.append(hash_id + ' ' + fields[4])
        sink.writelines(record + '\n' for record in records)
    return
def run_for_file_size(output_name, index_file=None, base_directory=None):
    """Write one ``<file name> <size in bytes>`` line per indexed file.

    The list of files comes from ``yaoner.read_index_file(index_file,
    base_directory)``; each entry is appended to ``base_directory`` to
    locate the file on disk, and only its base name is written out.

    All sizes are collected before ``output_name`` is opened, so a missing
    file does not leave a truncated output behind.

    :param output_name: path of the size listing to write.
    :param index_file: path of the index file; defaults to the original
        hard-coded location.
    :param base_directory: directory prefix of the indexed files; defaults
        to the original hard-coded location.
    :raises OSError: if an indexed file cannot be stat'ed.
    """
    if index_file is None:
        index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    if base_directory is None:
        base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'

    # NOTE(review): presumably returns paths relative to base_directory,
    # judging by how they are joined below -- confirm against yaoner.
    file_list = yaoner.read_index_file(index_file, base_directory)

    lines = []
    for file_path in file_list:
        # Plain concatenation is kept on purpose: base_directory is expected
        # to end with a path separator.
        file_size = os.path.getsize(''.join([base_directory, file_path]))
        lines.append(' '.join([os.path.basename(file_path), str(file_size)]))

    with open(output_name, 'w') as output_file:
        for line in lines:
            output_file.write(line)
            output_file.write('\n')
def main():
    """Run the currently enabled step of the file-size analysis.

    Only the per-type report is active.  The other calls are kept as
    comments, exactly as they were last used.
    """
    # run_for_file_size('/Users/Frank/working-directory/filesize/file-size1.txt')
    # combine_shell_ls_output(sys.argv[1:], '/Users/Frank/working-directory/filesize/metadata-file-size.txt')
    # generate_ratio_of_metadata_to_file('/Users/Frank/working-directory/filesize/metadata-file-size.txt',
    #                                    '/Users/Frank/working-directory/filesize/file-size.txt',
    #                                    '/Users/Frank/working-directory/filesize/ratio-of-metadata-to-file.tsv')
    ratio_index = '/Users/Frank/working-directory/filesize/ratio-of-metadata-to-file.tsv'
    file_type_index = '/Users/Frank/working-directory/fulldump/file-type-java.txt'
    output_prefix = '/Users/Frank/working-directory/filesize/output/'
    generate_by_type(ratio_index, file_type_index, output_prefix)


if __name__ == '__main__':
    main()