-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpositional_encoding.py
75 lines (60 loc) · 3.61 KB
/
positional_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
import pandas as pd
import argparse
import os
def generate_positional_encoding(num_positions: int, num_features: int) -> np.ndarray:
    """Build a sinusoidal positional encoding matrix.

    Each even-indexed column ``i`` holds ``sin(pos / 10000 ** (i / num_features))``
    and the odd-indexed column that follows it holds the cosine of the same
    angle, where ``pos`` is the zero-based row index.

    Args:
        num_positions: Number of positions (rows) to encode.
        num_features: Number of encoding dimensions (columns). An odd value is
            accepted; the final column then holds only the sine term because
            its cosine partner would fall outside the matrix.

    Returns:
        Array of shape ``(num_positions, num_features)``.

    Raises:
        ValueError: If either dimension is negative (raised by ``np.zeros``).
    """
    encoding = np.zeros((num_positions, num_features))
    if num_positions == 0 or num_features == 0:
        # Nothing to fill; also avoids dividing by a zero feature count below.
        return encoding

    # Column vector of positions broadcasts against the row of divisors,
    # giving one angle per (position, sin/cos pair).
    positions = np.arange(num_positions, dtype=np.float64)[:, np.newaxis]
    even_columns = np.arange(0, num_features, 2, dtype=np.float64)
    div_term = np.power(10000.0, even_columns / num_features)
    angles = positions / div_term

    encoding[:, 0::2] = np.sin(angles)
    # With an odd feature count the last sine column has no cosine partner,
    # so only as many cosine columns as actually exist are filled.
    encoding[:, 1::2] = np.cos(angles[:, : num_features // 2])
    return encoding
def main():
    """Append positional-encoding columns to three CSV files.

    The three inputs are read without a header row and treated as one
    continuous sequence: a single encoding matrix is generated for the total
    row count, then sliced so that positions continue from one file into the
    next. The encoding width equals the column count of the first file. Each
    input is written to its own output CSV (no header, no index) with the
    encoding columns appended on the right.

    If any input file is missing, an error message is printed and the
    function returns without reading or writing anything.
    """
    # Set up command line arguments
    parser = argparse.ArgumentParser(description="Add positional encoding to multiple CSV files.")
    parser.add_argument("input_file", type=str, help="Filename of the first CSV input file.")
    # %(default)s keeps the help text in sync with the actual default path.
    parser.add_argument("--file2", type=str, default='..\\Documents\\encoder_eval_d2_indicators_128.csv', help="Filename of the second CSV input file (default: %(default)s).")
    parser.add_argument("--file3", type=str, default='..\\Documents\\encoder_eval_d3_indicators_128.csv', help="Filename of the third CSV input file (default: %(default)s).")
    parser.add_argument("--output1", type=str, default="pos_encoded_file1.csv", help="Output filename for the first file (default: pos_encoded_file1.csv).")
    parser.add_argument("--output2", type=str, default="pos_encoded_file2.csv", help="Output filename for the second file (default: pos_encoded_file2.csv).")
    parser.add_argument("--output3", type=str, default="pos_encoded_file3.csv", help="Output filename for the third file (default: pos_encoded_file3.csv).")
    args = parser.parse_args()

    input_files = [args.input_file, args.file2, args.file3]
    output_files = [args.output1, args.output2, args.output3]

    # Verify every input up front so a missing later file is reported before
    # any earlier file has been parsed.
    for path in input_files:
        if not os.path.exists(path):
            print(f"Error: The file '{path}' does not exist.")
            return

    frames = [pd.read_csv(path, header=None) for path in input_files]

    # Encoding width follows the first file; all files are assumed to have
    # the same number of columns.
    num_features = frames[0].shape[1]
    # Total number of rows across all files
    num_positions = sum(len(frame) for frame in frames)

    # Generate positional encoding for the entire dataset sequence
    encoding = generate_positional_encoding(num_positions, num_features)
    encoding_columns = [f'pos_enc_{j}' for j in range(num_features)]

    # Hand each file the slice of rows matching its place in the sequence.
    start_idx = 0
    for frame, output_filename in zip(frames, output_files):
        end_idx = start_idx + len(frame)
        encoding_df = pd.DataFrame(encoding[start_idx:end_idx, :], columns=encoding_columns)

        # Concatenate the original data with its positional encoding columns
        combined = pd.concat(
            [frame.reset_index(drop=True), encoding_df.reset_index(drop=True)],
            axis=1,
        )

        combined.to_csv(output_filename, index=False, header=False)
        print(f"Positional encoding added and saved to '{output_filename}'.")

        # The next file continues where this one ended.
        start_idx = end_idx
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()