-import os
-import pandas as pd
-import Levenshtein as lv
-import unicodedata
-import numpy as np
-from openpyxl.utils import column_index_from_string as cifs
-import configparser
-
-columnsNumbers = []
-
-RED = '\033[91m'
-GREEN = '\033[92m'
-BLUE = '\033[94m'
-RESET = '\033[0m'
-
-
-def normalize(text): #normalize to unicode, latin letters
-    return (
-        unicodedata.normalize('NFKD', text.upper())
-        .replace('İ', 'I')
-        .replace('Ş', 'S')
-        .replace('Ğ', 'G')
-        .replace('Ü', 'U')
-        .replace('Ö', 'O') #İYİLİK - IYILIG
-        .replace('Ç', 'C')
-        .encode('ASCII', 'ignore')
-        .decode('utf-8')
-    )
-def listCSV(lst):
-    listOut = []
-    for file in lst:
-        if file.endswith(".csv"):
-            listOut.append(file)
-        else:
-            continue
-    return listOut
-
-def levenshtein(word, target):
-    word = normalize(word)
-    target = normalize(target) #Normalize input
-    dist = lv.distance(word, target)
-    if dist <= numTolerate:
-        return True
+import os
+import pandas as pd
+
+folderBase = os.getcwd()
+dataFolderName = "cleanData"
+dataFolderPath = os.path.join(folderBase,dataFolderName)
+outputData_clean = pd.DataFrame(columns=["Participant ID","Participant Group","ListID","Presented Word","Present Position","Recalled Word","Recall Position","Reaction Time","Hit"])
+outputData_intrusion = pd.DataFrame(columns=["Participant ID","Participant Group","ListID","Presented Word","Present Position","Recalled Word","Recall Position","Reaction Time","Hit"])
+
+for file in os.listdir(dataFolderPath):
+    participantId = file.split("-")[0]
+    participantGroup = file.split("-")[1][:3]
+    inputData = pd.read_csv(os.path.join(dataFolderPath,file))
+
+    if "clean" in file:
+        outputData_clean = pd.concat([inputData, outputData_clean], ignore_index=True)
+        outputData_clean["Participant ID"].fillna(f"{participantId}",inplace=True)
+        outputData_clean["Participant Group"].fillna(f"{participantGroup}",inplace=True)

+    if "intrusion" in file:
+        outputData_intrusion = pd.concat([inputData, outputData_intrusion], ignore_index=True)
+        outputData_intrusion["Participant ID"].fillna(f"{participantId}",inplace=True)
+        outputData_intrusion["Participant Group"].fillna(f"{participantGroup}",inplace=True)

-def convertLetterstoNumbers(lst): #For converting the letter inputs into values we can iterate through,
-                                  #and that will match with the indexing of the data
-    columnsNumbers.clear()
-    for str in lst:
-        idX = cifs(str)-1
-        columnsNumbers.append(idX)
-    print(f"Columns converted to index numbers. \n Numbers:: {columnsNumbers}")
-
-def dropColumns(x): #for removing the columns we did not want
-    tempRemove=[]
-    for i in range(len(x.columns)):
-        if i not in columnsNumbers:
-            tempRemove.append(x.columns[i])
-        else:
-            continue
-    for i in tempRemove:
-        x = x.drop(columns=[f"{i}"])
-    print("Columns Dropped!")
-    return x
-
-def columnStrip(df,col,char):
-    length = len(df[col])
-    for i in range(length):
-        df.loc[i, col] = df.loc[i, col].strip(char)
-    return df
-
-def findColumnTypes(df, searchedType):
-    temp = [] #returns names of columns with desired ending
-    for column in df.columns:
-        if column.endswith(searchedType):
-            temp.append(column)
-    return temp
-
-def isOneCellMP(cell): #checks if a single cell is one cell multiple response type
-
-    if type(cell) == str:
-        splitList = cell.split(" ")
-        if len(splitList) > 1 and len(splitList[1]) > 1:
-            return True
-        else:
-            return False
-
-def arrayCombine(array): #combines the inputs in the array
-    rows = [] #to be used for extracting rt values
-    combined = []
-    for i in range(len(array)):
-        cell = array[i]
-        if isOneCellMP(cell) == True:
-            dct = cell.split(" ")
-            for word in dct:
-                combined.append(word)
-                rows.append(i)
-
-        if isOneCellMP(cell) == False and type(cell) == str:
-            combined.append(cell)
-            rows.append(i)
+    outputData_clean.to_csv(os.path.join(dataFolderPath,"CombinedClean.csv"), index=False, encoding="utf-8-sig")
+    outputData_intrusion.to_csv(os.path.join(dataFolderPath,"CombinedIntrusion.csv"), index=False, encoding="utf-8-sig")

-    return combined, rows
-# def indexOneCellMultipleResponse(array):
-#     tmp = array
-#     output = []
-#     for i in range(len(tmp)):
-#         cell = tmp[i]
-#         if isOneCellMP(cell) == True:
-#             output.append(i)
-#         else: continue

-#     return output
-
-# def OneCellDeconstruct(array):
-#     newArray = []
-#     for cellnum in indexOneCellMultipleResponse(array):
-#         cellSplit = array[cellnum].split(" ")
-#         for word in cellSplit:
-#             newArray.append(word)
-#     return newArray
-
-def createRecallRtimeTable():
-    RecallRtimeTable = pd.DataFrame(columns=["ListID","Recalled Word","Recall Position","Reaction Time","RT Viable"])
-    return RecallRtimeTable
-
-def fillRecallRtimeTable(df,table,RecallEnd,ReactionEnd):
-    temp = table
-    recallColumnNames = findColumnTypes(df, RecallEnd)
-    pushtoRow = 0 #which row to push to
-    for ncol in range(len(recallColumnNames)): #iterate through the recall cols list
-        recallCol = recallColumnNames[ncol] #get the name, ncol is used to have the listID
-        array = df[recallCol] #set the array
-        recallWords, rows = arrayCombine(array) #get combined array. with row ids to use in getting reaction times.
-        for i in range(len(recallWords)):
-            temp.loc[pushtoRow, "ListID"] = ncol
-            temp.loc[pushtoRow, "Recalled Word"] = recallWords[i]
-            temp.loc[pushtoRow, "Recall Position"] = i+1
-            temp.loc[pushtoRow, "Reaction Time"] = rows[i]
-            pushtoRow += 1
-
-
-    reactionColumnNames = findColumnTypes(df, ReactionEnd) #take the columns with endind
-    for i in range(len(temp["Recalled Word"])):
-        gettable = temp["ListID"][i] #for the given row, take the table id
-        getindex = temp["Reaction Time"][i] #for the given row, take the row number from reaction time column. where the previous function wrote.
-        reactionCol = df[reactionColumnNames[gettable]] #take the relevant reaction time column
-        temp.loc[i,"Reaction Time"] = reactionCol[getindex] #take the value
-
-    temp = columnStrip(temp, "Recalled Word", "\n")
-
-    return temp
-
-def whichRowstoLists(df, columnName: str, listNum: int):
-    """
-    Returns the row indexes where a list starts and ends.
-    [start, end, start, end...]
-
-    Args:
-    columnName: string type name, where the function will work
-    listNum: Number of lists to look for, there is an overflow problem of one extra list. Used for that. Exclude trial list if you have one on this count.
-
-    """
-    currentListCount = 0
-    rowNumbersList = []
-    length = len(df[columnName])
-    array = df[columnName]
-
-    for i in range(length-1):
-        Left = array[i]
-        Right = array[i+1]
-        if pd.isna(Left) == True and pd.isna(Right) == False:
-            if currentListCount <= listNum:
-                rowNumbersList.append(i+1)
-            else:
-                continue
-        if pd.isna(Left) == False and pd.isna(Right) == True:
-            if currentListCount <= listNum:
-                rowNumbersList.append(i)
-                currentListCount += 1
-            else:
-                continue
-    return rowNumbersList
-
-def createWordPresentTable():
-    return pd.DataFrame(columns=["ListID","Words Presented","Present Position"])
-
-def fillWordPresentTable(df, columnName:str, listNum: int, table):
-
-    rowNums = whichRowstoLists(df, columnName, listNum)
-    array = df[columnName]
-
-    currentListIndexer = 0 #to generate indexes from rowNums lit
-    currentListIdentifier = 0 #to write at column
-    builderRowNum = 0 #for using at .loc
-
-    for x in range(listNum+1):
-        stratRowNum = int(rowNums[currentListIndexer])
-        endRowNum = int(rowNums[currentListIndexer+1])
-
-        for i in range(stratRowNum,endRowNum+1):
-            table.loc[builderRowNum, "ListID"] = currentListIdentifier
-            table.loc[builderRowNum, "Words Presented"] = array[i]
-            table.loc[builderRowNum, "Present Position"] = (i - stratRowNum) + 1
-            builderRowNum += 1
-
-        currentListIndexer += 2
-        currentListIdentifier += 1
-    return table
-
-def mergeTables(wordPresent, recall):
-    newTable = pd.DataFrame(columns=["ListID","Presented Word","Present Position","Recalled Word","Recall Position","Reaction Time","Hit"])
-    remainderRecall = recall
-    currentRow = 0
-    wordsCol = wordPresent["Words Presented"]
-    recallCol = recall["Recalled Word"]
-
-    for i in range(len(wordsCol)):
-        presented = normalize(wordsCol[i])
-        presentedListRelation = wordPresent["ListID"][i]
-
-        newTable.loc[currentRow, "ListID"] = wordPresent["ListID"][i]
-        newTable.loc[currentRow, "Presented Word"] = wordPresent["Words Presented"][i]
-        newTable.loc[currentRow, "Present Position"] = wordPresent["Present Position"][i]
-
-        found = False
-
-        for x in range(len(recallCol)):
-            recalled = normalize(recallCol[x])
-            recallListRelation = recall["ListID"][x]
-
-            if recalled == presented and recallListRelation == presentedListRelation and found == False:
-                newTable.loc[currentRow, "Recalled Word"] = recall["Recalled Word"][x]
-                newTable.loc[currentRow, "Recall Position"] = recall["Recall Position"][x]
-                newTable.loc[currentRow, "Reaction Time"] = recall["Reaction Time"][x]
-                newTable.loc[currentRow, "Hit"] = 1 # true hit
-                remainderRecall = remainderRecall.drop(index = x) #to take in remaining values
-                currentRow += 1
-                found = True
-
-
-            if recalled == presented and recallListRelation != presentedListRelation and found == False:
-                newTable.loc[currentRow, "Recalled Word"] = recall["Recalled Word"][x]
-                newTable.loc[currentRow, "Recall Position"] = recall["Recall Position"][x]
-                newTable.loc[currentRow, "Reaction Time"] = recall["Reaction Time"][x]
-                newTable.loc[currentRow, "Hit"] = 2 #list intrusion
-                remainderRecall = remainderRecall.drop(index = x) #to take in remaining values
-                currentRow += 1
-                found = True
-
-        if found == False:
-            newTable.loc[currentRow, "Hit"] = 0 #list intrusion
-            currentRow += 1
-
-    return newTable, remainderRecall
-
-###Define folders
-folderBase = os.getcwd() #/../psychopy-recall-suffix
-folderDataRaw = os.path.join(folderBase, "data")
-folderRawCSV = listCSV(os.listdir(folderDataRaw))
-
-#have config
-pathConfig = os.path.join(folderBase,"config.ini")
-config = configparser.ConfigParser()
-config.read(pathConfig)
-
-numTolerate = int(config["DEFAULT"]["numTolerate"])
-numLists = int(config["DEFAULT"]["numLists"]) # Need to match key name
-columnsExtract = config["DEFAULT"].get("columnsExtract").replace(" ", "").split(",")
-trialyes = int(config["DEFAULT"]["trialyes"])
-
-if "cleanData" not in os.listdir(folderBase): #create the cleanData folder
-    os.mkdir("cleanData")
-
-folderDataClean = os.path.join(folderBase, "cleanData")
-
-for file in folderRawCSV: #start the loop, it starts if a data is not already cleaned
-
-    fileCleaned = file.strip(".csv") + "_clean.csv"
-    fileRemainder = file.strip(".csv") + "_intrusions.csv"
-    filePath = os.path.join(folderDataRaw, file)
-
-    if fileCleaned not in os.listdir(folderDataClean):
-        #left here, continue from
-        dataInput = pd.read_csv(filePath) #take file
-
-        convertLetterstoNumbers(columnsExtract) #Now we can turn it to numbers.
-        #the number array is returned to a global list created at the start.
-        #that number array is fed into dropColumns
-        dataInput = dropColumns(dataInput)
-
-        recallTable = fillRecallRtimeTable(dataInput, createRecallRtimeTable(),".text",".rt")
-        wordsTable = fillWordPresentTable(dataInput, "Words", 4, createWordPresentTable())
-
-        table, remainder = mergeTables(wordsTable,recallTable)
-
-        #save the files
-
-        output_path = os.path.join(folderDataClean,fileCleaned)
-
-        table.to_csv(output_path, index=False, encoding="utf-8-sig")
-        remainder.to_csv(os.path.join(folderDataClean,fileRemainder))
-
-        print(f"{GREEN}File created!{fileCleaned, fileRemainder}{RESET}")
-
-
-    if fileCleaned in os.listdir(folderDataClean):
-        print(f"{BLUE}File with name: {fileCleaned}, already exists in cleanData as {file}_clean.csv{RESET}")
-
-        continue
-