import csv
import hashlib
import os
from datetime import datetime

import pandas as pd

CSV_DIR = os.path.join(os.getcwd(), 'csv')  # output directory for the generated CSV files


def get_hashes_in_dir(dir_path: str) -> list:
    hash_list = []
    for subdir, dirs, files in os.walk(dir_path):  # loop through all files in the directory tree and generate hashes
        for file in files:
            filepath = os.path.join(subdir, file)
            with open(filepath, 'rb') as f:
                filehash = hashlib.sha256(f.read()).hexdigest()
            hash_list.append({'filepath': filepath, 'filename': file, 'sha256 hash': filehash})
    return hash_list
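

# A minimal sketch of a chunk-based alternative to the full f.read() above, in
# case submission files are large: hashlib hash objects support incremental
# update(), so this never loads a whole file into memory. The helper name and
# chunk size are illustrative assumptions, not part of the original script.
def sha256_of_file_chunked(filepath: str, chunk_size: int = 65536) -> str:
    sha = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):  # read until EOF yields b''
            sha.update(chunk)
    return sha.hexdigest()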


def hash_submissions(submissions_dir_path: str) -> str:
    os.makedirs(CSV_DIR, exist_ok=True)
    submissions_dir_name = os.path.basename(os.path.abspath(submissions_dir_path))  # name of the submission/assignment = rightmost path component
    csv_file_name = f'{submissions_dir_name}_file_hashes_{datetime.now().strftime("%Y%m%d-%H%M%S")}.csv'
    csv_file_path = os.path.join(CSV_DIR, csv_file_name)
    with open(csv_file_path, 'w', newline='') as csvfile:  # open the output CSV file for writing
        fieldnames = ['Student ID', 'filepath', 'filename', 'sha256 hash']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for student_dir_name in os.listdir(submissions_dir_path):  # loop through each student dir to get hashes for all files per student
            student_dir_path = os.path.join(submissions_dir_path, student_dir_name)
            hash_records = get_hashes_in_dir(student_dir_path)  # list of hash records for all files this student submitted
            for record in hash_records:
                record.update({'Student ID': student_dir_name})  # tag each hash record with the student id
            writer.writerows(hash_records)
    print(f'[INFO] Created CSV file with all files & hashes in {submissions_dir_name}\nCSV file: {csv_file_path}')
    return csv_file_path
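
# Example of the resulting CSV layout (illustrative values, assuming student
# directories are named by student id):
#
#   Student ID,filepath,filename,sha256 hash
#   s1234567,assignment1/s1234567/report.pdf,report.pdf,9f86d081884c7d65...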


def get_suspicious_hashes(df: pd.DataFrame) -> list:
    drop_columns = ['filepath', 'filename']  # only 'Student ID' and 'sha256 hash' are needed for the groupby below
    df = df.drop(columns=drop_columns).sort_values('sha256 hash')  # drop unneeded columns & sort by hash
    duplicate_hash = df.loc[df.duplicated(subset=['sha256 hash'], keep=False), :]  # all files with a duplicate hash, incl. duplicates from the same student id
    hash_with_multiple_student_ids = duplicate_hash.groupby('sha256 hash').agg(lambda x: len(x.unique()) > 1)  # True if a hash belongs to more than one unique student id (same file submitted by multiple students), False if all copies come from a single student
    suspicious_hashes_list = hash_with_multiple_student_ids[hash_with_multiple_student_ids['Student ID'] == True].index.to_list()  # duplicate hashes shared across different student ids; duplicates within a single student id are excluded
    return suspicious_hashes_list
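

# A toy illustration of the logic above (hypothetical data): hash 'aaa' is
# shared by two different students, so it is flagged; 'bbb' is duplicated only
# within one student's own files, so it is not.
#
#   df = pd.DataFrame({'Student ID': ['s1', 's2', 's3', 's3'],
#                      'filepath': ['p1', 'p2', 'p3', 'p4'],
#                      'filename': ['f1', 'f2', 'f3', 'f4'],
#                      'sha256 hash': ['aaa', 'aaa', 'bbb', 'bbb']})
#   get_suspicious_hashes(df)  # -> ['aaa']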


def suspicious_by_hash(df: pd.DataFrame) -> pd.DataFrame:
    suspicious_hashes_list = get_suspicious_hashes(df)
    files_with_suspicious_hash = df[df['sha256 hash'].isin(suspicious_hashes_list)]  # duplicates confined to a single student id were already excluded upstream
    return files_with_suspicious_hash.sort_values(['sha256 hash', 'Student ID'])
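

# A minimal usage sketch (assumed entry point, not part of the original file):
# hash every submission under a directory, load the resulting CSV, and print
# the rows whose hashes appear under more than one student id. The default
# directory path and the sys.argv handling are illustrative assumptions.
if __name__ == '__main__':
    import sys

    submissions_dir = sys.argv[1] if len(sys.argv) > 1 else 'submissions'
    csv_path = hash_submissions(submissions_dir)
    hashes_df = pd.read_csv(csv_path)
    print(suspicious_by_hash(hashes_df))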