Skip to content

Commit 28013a0

Browse files
authored
Merge pull request #2 from meburak/newScript
New script
2 parents cfcc776 + 27e5b5b commit 28013a0

File tree

2 files changed

+52
-307
lines changed

2 files changed

+52
-307
lines changed

combine.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""Combine the per-participant CSV files in ``cleanData`` into two summary files.

File names are expected to look like ``<participantId>-<group...>`` where the
first three characters after the first dash identify the participant group.
Files whose name contains ``clean`` go to ``CombinedClean.csv`` and files whose
name contains ``intrusion`` go to ``CombinedIntrusion.csv``.
"""
import os
import pandas as pd

folderBase = os.getcwd()
dataFolderName = "cleanData"
dataFolderPath = os.path.join(folderBase, dataFolderName)

OUTPUT_COLUMNS = [
    "Participant ID",
    "Participant Group",
    "ListID",
    "Presented Word",
    "Present Position",
    "Recalled Word",
    "Recall Position",
    "Reaction Time",
    "Hit",
]
COMBINED_CLEAN_NAME = "CombinedClean.csv"
COMBINED_INTRUSION_NAME = "CombinedIntrusion.csv"


def _tagParticipant(frame, participantId, participantGroup):
    """Fill the participant columns of *frame*, keeping values already present."""
    labels = (
        ("Participant ID", participantId),
        ("Participant Group", participantGroup),
    )
    for column, value in labels:
        if column in frame.columns:
            # Assign the result back instead of a chained ``inplace=True`` call,
            # which does not update the frame under pandas Copy-on-Write.
            frame[column] = frame[column].fillna(value)
        else:
            frame[column] = value
    return frame


def _stack(frames):
    """Concatenate *frames* and make sure every expected output column exists."""
    if not frames:
        return pd.DataFrame(columns=OUTPUT_COLUMNS)
    combined = pd.concat(frames, ignore_index=True)
    missing = [name for name in OUTPUT_COLUMNS if name not in combined.columns]
    return combined.reindex(columns=list(combined.columns) + missing)


def combineFolder(folderPath):
    """Read every participant CSV in *folderPath* and return the combined frames.

    Files are processed in sorted name order so the output is deterministic.
    Skipped: non-CSV files, names without a dash (no participant/group can be
    parsed from them) and the combined output files themselves, which live in
    the same folder and would otherwise break a second run.

    Returns:
        A ``(clean, intrusion)`` tuple of DataFrames.
    """
    outputNames = {COMBINED_CLEAN_NAME, COMBINED_INTRUSION_NAME}
    cleanFrames = []
    intrusionFrames = []

    for file in sorted(os.listdir(folderPath)):
        if file in outputNames or not file.lower().endswith(".csv"):
            continue
        nameParts = file.split("-")
        if len(nameParts) < 2:
            continue
        isClean = "clean" in file
        isIntrusion = "intrusion" in file
        if not (isClean or isIntrusion):
            continue

        participantId = nameParts[0]
        participantGroup = nameParts[1][:3]
        inputData = pd.read_csv(os.path.join(folderPath, file))
        inputData = _tagParticipant(inputData, participantId, participantGroup)

        if isClean:
            cleanFrames.append(inputData)
        if isIntrusion:
            intrusionFrames.append(inputData)

    # One concat per output instead of re-concatenating inside the loop.
    return _stack(cleanFrames), _stack(intrusionFrames)


if __name__ == "__main__":
    outputData_clean, outputData_intrusion = combineFolder(dataFolderPath)
    outputData_clean.to_csv(os.path.join(dataFolderPath, COMBINED_CLEAN_NAME), index=False, encoding="utf-8-sig")
    outputData_intrusion.to_csv(os.path.join(dataFolderPath, COMBINED_INTRUSION_NAME), index=False, encoding="utf-8-sig")
27+
28+

demo.py

Lines changed: 24 additions & 307 deletions
Original file line numberDiff line numberDiff line change
@@ -1,311 +1,28 @@
1-
import os
2-
import pandas as pd
3-
import Levenshtein as lv
4-
import unicodedata
5-
import numpy as np
6-
from openpyxl.utils import column_index_from_string as cifs
7-
import configparser
8-
9-
columnsNumbers = []
10-
11-
RED = '\033[91m'
12-
GREEN = '\033[92m'
13-
BLUE = '\033[94m'
14-
RESET = '\033[0m'
15-
16-
17-
def normalize(text):
    """Return *text* upper-cased and reduced to plain ASCII letters.

    The text is upper-cased, decomposed with NFKD so accented letters split
    into a base letter plus combining marks, and then every non-ASCII code
    point (including those marks) is dropped. For example ``"çiğ"`` becomes
    ``"CIG"``.

    No explicit replacement of the Turkish capitals is needed: NFKD already
    decomposes them, so replacements applied after it would never match.
    """
    decomposed = unicodedata.normalize("NFKD", text.upper())
    return decomposed.encode("ascii", "ignore").decode("ascii")
29-
def listCSV(lst):
    """Return the names in *lst* that end with ``.csv``, in their original order.

    The suffix check is case-sensitive.
    """
    return [file for file in lst if file.endswith(".csv")]
37-
38-
def levenshtein(word, target):
39-
word = normalize(word)
40-
target = normalize(target) #Normalize input
41-
dist = lv.distance(word, target)
42-
if dist <= numTolerate:
43-
return True
1+
import os
2+
import pandas as pd
3+
4+
folderBase = os.getcwd()
5+
dataFolderName = "cleanData"
6+
dataFolderPath = os.path.join(folderBase,dataFolderName)
7+
outputData_clean = pd.DataFrame(columns=["Participant ID","Participant Group","ListID","Presented Word","Present Position","Recalled Word","Recall Position","Reaction Time","Hit"])
8+
outputData_intrusion = pd.DataFrame(columns=["Participant ID","Participant Group","ListID","Presented Word","Present Position","Recalled Word","Recall Position","Reaction Time","Hit"])
9+
10+
for file in os.listdir(dataFolderPath):
11+
participantId = file.split("-")[0]
12+
participantGroup = file.split("-")[1][:3]
13+
inputData = pd.read_csv(os.path.join(dataFolderPath,file))
14+
15+
if "clean" in file:
16+
outputData_clean = pd.concat([inputData, outputData_clean], ignore_index=True)
17+
outputData_clean["Participant ID"].fillna(f"{participantId}",inplace=True)
18+
outputData_clean["Participant Group"].fillna(f"{participantGroup}",inplace=True)
4419

20+
if "intrusion" in file:
21+
outputData_intrusion = pd.concat([inputData, outputData_intrusion], ignore_index=True)
22+
outputData_intrusion["Participant ID"].fillna(f"{participantId}",inplace=True)
23+
outputData_intrusion["Participant Group"].fillna(f"{participantGroup}",inplace=True)
4524

46-
def convertLetterstoNumbers(lst):
    """Convert spreadsheet column letters into zero-based column indexes.

    The module-level ``columnsNumbers`` list is emptied and refilled in place,
    so code holding a reference to it sees the new values. ``cifs`` yields
    1-based indexes, hence the ``- 1``.

    Args:
        lst: iterable of column letters such as ``"A"`` or ``"AB"``.
    """
    columnsNumbers.clear()
    # The loop variable is not called ``str`` so the builtin stays usable.
    columnsNumbers.extend(cifs(letters) - 1 for letters in lst)
    print(f"Columns converted to index numbers. \n Numbers:: {columnsNumbers}")
53-
54-
def dropColumns(x):
    """Return *x* reduced to the columns whose position is in ``columnsNumbers``.

    Columns are selected by position in a single step, so the result does not
    depend on the column labels being strings. The original column order is
    kept.

    Args:
        x: DataFrame to filter.

    Returns:
        A DataFrame containing only the wanted columns.
    """
    wanted = set(columnsNumbers)
    keep = [position for position in range(len(x.columns)) if position in wanted]
    x = x.iloc[:, keep]
    print("Columns Dropped!")
    return x
65-
66-
def columnStrip(df, col, char):
    """Strip the characters in *char* from both ends of every string in ``df[col]``.

    The column is replaced in *df* itself and the same frame is returned.
    Cells that are not strings (for example NaN) are left unchanged, and the
    frame may use any index, not only 0..n-1.

    Args:
        df: DataFrame to modify.
        col: name of the column to clean.
        char: characters to strip, as accepted by ``str.strip``.

    Returns:
        *df*, after modification.
    """
    df[col] = df[col].map(
        lambda cell: cell.strip(char) if isinstance(cell, str) else cell
    )
    return df
71-
72-
def findColumnTypes(df, searchedType):
    """Return the names of the columns of *df* that end with *searchedType*.

    Names are returned in the order they appear in the frame.
    """
    return [column for column in df.columns if column.endswith(searchedType)]
78-
79-
def isOneCellMP(cell):
    """Tell whether *cell* holds several responses typed into a single cell.

    A cell counts as multi-response when it is a string that, split on single
    spaces, has at least two parts and whose second part is longer than one
    character. Anything that is not a string (for example NaN) is never
    multi-response.

    Returns:
        ``True`` or ``False``.
    """
    if not isinstance(cell, str):
        return False
    parts = cell.split(" ")
    return len(parts) > 1 and len(parts[1]) > 1
87-
88-
def arrayCombine(array): #combines the inputs in the array
89-
rows = [] #to be used for extracting rt values
90-
combined = []
91-
for i in range(len(array)):
92-
cell = array[i]
93-
if isOneCellMP(cell) == True:
94-
dct = cell.split(" ")
95-
for word in dct:
96-
combined.append(word)
97-
rows.append(i)
98-
99-
if isOneCellMP(cell) == False and type(cell) == str:
100-
combined.append(cell)
101-
rows.append(i)
25+
outputData_clean.to_csv(os.path.join(dataFolderPath,"CombinedClean.csv"), index=False, encoding="utf-8-sig")
26+
outputData_intrusion.to_csv(os.path.join(dataFolderPath,"CombinedIntrusion.csv"), index=False, encoding="utf-8-sig")
10227

103-
return combined, rows
104-
# def indexOneCellMultipleResponse(array):
105-
# tmp = array
106-
# output = []
107-
# for i in range(len(tmp)):
108-
# cell = tmp[i]
109-
# if isOneCellMP(cell) == True:
110-
# output.append(i)
111-
# else: continue
11228

113-
# return output
114-
115-
# def OneCellDeconstruct(array):
116-
# newArray = []
117-
# for cellnum in indexOneCellMultipleResponse(array):
118-
# cellSplit = array[cellnum].split(" ")
119-
# for word in cellSplit:
120-
# newArray.append(word)
121-
# return newArray
122-
123-
def createRecallRtimeTable():
    """Return an empty table with the recall / reaction-time column headers."""
    headers = ["ListID", "Recalled Word", "Recall Position", "Reaction Time", "RT Viable"]
    return pd.DataFrame(columns=headers)
126-
127-
def fillRecallRtimeTable(df, table, RecallEnd, ReactionEnd):
    """Fill *table* with one row per recalled word and its reaction time.

    Recall columns are those of *df* whose name ends with *RecallEnd*;
    reaction-time columns are those ending with *ReactionEnd*. The n-th recall
    column is paired with the n-th reaction-time column, and n is written as
    the ``ListID``.

    The reaction time is looked up while each row is written, so neither the
    source row number nor the list id has to be stored in the table and read
    back later.

    Args:
        df: raw participant data.
        table: empty table from ``createRecallRtimeTable``; it is filled in
            place starting at row 0.
        RecallEnd: suffix of the recall column names.
        ReactionEnd: suffix of the reaction-time column names.

    Returns:
        The filled table, with newlines stripped from ``Recalled Word``.
    """
    recallColumnNames = findColumnTypes(df, RecallEnd)
    reactionColumnNames = findColumnTypes(df, ReactionEnd)
    pushtoRow = 0  # next row of the table to write

    for listId, recallCol in enumerate(recallColumnNames):
        # rows[i] is the df row that recallWords[i] came from.
        recallWords, rows = arrayCombine(df[recallCol])
        if not recallWords:
            continue
        reactionTimes = df[reactionColumnNames[listId]]
        for position, (word, sourceRow) in enumerate(zip(recallWords, rows), start=1):
            table.loc[pushtoRow, "ListID"] = listId
            table.loc[pushtoRow, "Recalled Word"] = word
            table.loc[pushtoRow, "Recall Position"] = position
            table.loc[pushtoRow, "Reaction Time"] = reactionTimes[sourceRow]
            pushtoRow += 1

    return columnStrip(table, "Recalled Word", "\n")
153-
154-
def whichRowstoLists(df, columnName: str, listNum: int):
    """
    Returns the row positions where each list starts and ends.
    [start, end, start, end...]

    A list is a run of consecutive non-empty cells. Runs that begin on the
    first row or finish on the last row are recorded too, so starts and ends
    always come in pairs.

    Args:
        df: DataFrame holding the column.
        columnName: string type name, where the function will work
        listNum: Number of lists to look for. At most ``listNum + 1`` lists
            are reported, which leaves room for one extra list; exclude the
            trial list from this count if you have one.

    Returns:
        A flat list of row positions.
    """
    present = df[columnName].notna().tolist()
    lastRow = len(present) - 1
    currentListCount = 0
    rowNumbersList = []

    for row, hasWord in enumerate(present):
        if not hasWord:
            continue
        # Rows outside the column count as empty.
        startsHere = row == 0 or not present[row - 1]
        endsHere = row == lastRow or not present[row + 1]
        if startsHere and currentListCount <= listNum:
            rowNumbersList.append(row)
        if endsHere and currentListCount <= listNum:
            rowNumbersList.append(row)
            currentListCount += 1
    return rowNumbersList
184-
185-
def createWordPresentTable():
    """Return an empty table with the presented-word column headers."""
    headers = ["ListID", "Words Presented", "Present Position"]
    table = pd.DataFrame(columns=headers)
    return table
187-
188-
def fillWordPresentTable(df, columnName: str, listNum: int, table):
    """Fill *table* with every presented word, its list id and its position.

    The start/end rows of each list come from ``whichRowstoLists``. Lists are
    numbered from 0 and ``listNum + 1`` lists are written. Positions inside a
    list start at 1.

    Args:
        df: raw participant data.
        columnName: column of *df* that holds the presented words.
        listNum: number of lists, not counting the extra (trial) list.
        table: empty table from ``createWordPresentTable``; filled in place.

    Returns:
        The filled table.

    Raises:
        IndexError: when fewer than ``listNum + 1`` complete lists are found.
            The check runs before anything is written, so the table is never
            left half filled.
    """
    rowNums = whichRowstoLists(df, columnName, listNum)
    array = df[columnName]
    expectedLists = listNum + 1

    if len(rowNums) < 2 * expectedLists:
        raise IndexError(
            f"Expected {expectedLists} lists in column '{columnName}' "
            f"but found only {len(rowNums) // 2} complete ones."
        )

    builderRowNum = 0  # next row of the table to write
    for listId in range(expectedLists):
        # rowNums is laid out as [start, end, start, end, ...].
        startRowNum = int(rowNums[2 * listId])
        endRowNum = int(rowNums[2 * listId + 1])

        for position, row in enumerate(range(startRowNum, endRowNum + 1), start=1):
            table.loc[builderRowNum, "ListID"] = listId
            table.loc[builderRowNum, "Words Presented"] = array[row]
            table.loc[builderRowNum, "Present Position"] = position
            builderRowNum += 1
    return table
210-
211-
def mergeTables(wordPresent, recall):
    """Match every presented word with a recalled word and score the result.

    One output row is written per presented word. Words are compared after
    ``normalize``. The ``Hit`` column is:

    * ``1`` - the word was recalled for the list it was presented in,
    * ``2`` - the word was recalled, but only for a different list,
    * ``0`` - the word was not recalled.

    A recall for the same list is always preferred over one for another list,
    whatever their order. Each recalled row is used for at most one presented
    word.

    Args:
        wordPresent: table with ``ListID``, ``Words Presented`` and
            ``Present Position`` columns.
        recall: table with ``ListID``, ``Recalled Word``, ``Recall Position``
            and ``Reaction Time`` columns.
            NOTE(review): rows are addressed by position; this assumes the
            default 0..n-1 index produced by the table builders - confirm.

    Returns:
        ``(newTable, remainderRecall)`` where *remainderRecall* holds the
        recalled rows that were not matched to any presented word.
    """
    newTable = pd.DataFrame(columns=["ListID","Presented Word","Present Position","Recalled Word","Recall Position","Reaction Time","Hit"])

    presentedLists = list(wordPresent["ListID"])
    presentedWords = list(wordPresent["Words Presented"])
    presentedPositions = list(wordPresent["Present Position"])

    recallLists = list(recall["ListID"])
    # Normalise the recalled words once instead of once per presented word.
    recallKeys = [normalize(word) for word in recall["Recalled Word"]] if presentedWords else []
    consumed = set()  # positions of recall rows already matched

    presentedRows = zip(presentedLists, presentedWords, presentedPositions)
    for currentRow, (listId, word, position) in enumerate(presentedRows):
        presented = normalize(word)

        newTable.loc[currentRow, "ListID"] = listId
        newTable.loc[currentRow, "Presented Word"] = word
        newTable.loc[currentRow, "Present Position"] = position

        candidates = [
            x for x, key in enumerate(recallKeys)
            if key == presented and x not in consumed
        ]
        match = next((x for x in candidates if recallLists[x] == listId), None)
        if match is not None:
            hit = 1  # true hit
        elif candidates:
            match = candidates[0]
            hit = 2  # list intrusion
        else:
            hit = 0  # miss: the word was never recalled

        if match is not None:
            consumed.add(match)
            newTable.loc[currentRow, "Recalled Word"] = recall["Recalled Word"].iloc[match]
            newTable.loc[currentRow, "Recall Position"] = recall["Recall Position"].iloc[match]
            newTable.loc[currentRow, "Reaction Time"] = recall["Reaction Time"].iloc[match]
        newTable.loc[currentRow, "Hit"] = hit

    # Whatever was never matched stays in the remainder table.
    remainderRecall = recall.drop(index=recall.index[sorted(consumed)])
    return newTable, remainderRecall
256-
257-
###Define folders
folderBase = os.getcwd() #/../psychopy-recall-suffix
folderDataRaw = os.path.join(folderBase, "data")
folderRawCSV = listCSV(os.listdir(folderDataRaw))

#have config
pathConfig = os.path.join(folderBase,"config.ini")
config = configparser.ConfigParser()
config.read(pathConfig)

numTolerate = int(config["DEFAULT"]["numTolerate"])
numLists = int(config["DEFAULT"]["numLists"]) # Need to match key name
columnsExtract = config["DEFAULT"].get("columnsExtract").replace(" ", "").split(",")
trialyes = int(config["DEFAULT"]["trialyes"])

#create the cleanData folder when it is missing
folderDataClean = os.path.join(folderBase, "cleanData")
os.makedirs(folderDataClean, exist_ok=True)

#names already in cleanData, listed once instead of once per file
alreadyCleaned = set(os.listdir(folderDataClean))

for file in folderRawCSV: #a file is only processed when it was not cleaned before

    #remove the ".csv" suffix; str.strip(".csv") would instead remove any of
    #the characters ".", "c", "s", "v" from both ends of the name
    fileStem = os.path.splitext(file)[0]
    fileCleaned = fileStem + "_clean.csv"
    fileRemainder = fileStem + "_intrusions.csv"
    filePath = os.path.join(folderDataRaw, file)

    if fileCleaned in alreadyCleaned:
        print(f"{BLUE}File for {file} already exists in cleanData as {fileCleaned}{RESET}")
        continue

    dataInput = pd.read_csv(filePath) #take file

    #turn the configured column letters into index numbers; the result is
    #stored in the global columnsNumbers list, which dropColumns reads
    convertLetterstoNumbers(columnsExtract)
    dataInput = dropColumns(dataInput)

    recallTable = fillRecallRtimeTable(dataInput, createRecallRtimeTable(),".text",".rt")
    #NOTE(review): the list count is hard-coded to 4 while numLists is read
    #from the config and never used - confirm which one is intended
    wordsTable = fillWordPresentTable(dataInput, "Words", 4, createWordPresentTable())

    table, remainder = mergeTables(wordsTable,recallTable)

    #save the files
    output_path = os.path.join(folderDataClean,fileCleaned)

    table.to_csv(output_path, index=False, encoding="utf-8-sig")
    #NOTE(review): the remainder file keeps its index column and uses the
    #default encoding, unlike the clean file - confirm this is wanted
    remainder.to_csv(os.path.join(folderDataClean,fileRemainder))
    alreadyCleaned.add(fileCleaned)

    print(f"{GREEN}File created!{fileCleaned, fileRemainder}{RESET}")
311-

0 commit comments

Comments
 (0)