J'ai plusieurs fichiers txt qui contiennent des informations CDR, les CDR sont distribués dans plusieurs fichiers. J'ai besoin de trouver des téléphones dans ces fichiers, puis de vérifier les correspondances avec le fichier xls. J'ai écrit une version à thread unique, puis multithread et j'ai trouvé que parfois le multithread est plus lent que le thread unique.
Single Thread:
import re
import os
import time
import sys
import pandas
import ipaddress
def improve_view_n(string_to_improve):
    """Turn a comma-separated record into a space-separated string.

    Empty fields (produced by consecutive commas) are dropped. Every kept
    field is followed by exactly one space, so a non-empty result always
    ends with a trailing space.

    Args:
        string_to_improve: Comma-separated text.

    Returns:
        The non-empty fields, each followed by a single space.
    """
    # A single join() replaces the repeated "+=" concatenation.
    return "".join(
        f"{field} " for field in string_to_improve.split(",") if field
    )
def upload_number_list():
    """Read the numbers to search for from ``number_list.txt``.

    One number is expected per line. Surrounding whitespace (including
    ``\\r`` from Windows line endings) and backslashes are stripped, and
    lines that end up empty are skipped: an empty entry would otherwise be
    interpolated into the search regex and match every record.

    Returns:
        List of non-empty number strings, in file order.

    Raises:
        SystemExit: If ``number_list.txt`` does not exist (after printing a
            message and waiting 5 seconds).
    """
    try:
        with open(file="number_list.txt", mode="r") as f:
            stripped_lines = (line.strip("\\ \t\r\n") for line in f)
            numbers = [entry for entry in stripped_lines if entry]
    except FileNotFoundError:
        print("number_list.txt file does not exist or corrupted.\n\n")
        print("The program will be terminated in 5 seconds")
        time.sleep(5)
        # Non-zero status so that callers/scripts can detect the failure.
        sys.exit(1)
    return numbers
def search_for_pattern(number):
    """Search every ``MSK*`` file in the current directory for *number*.

    Each file whose name starts with ``MSK`` is read completely and scanned
    for records of the form ``,,,,,<anything><number><anything>,``. Every
    match is logged through ``write_searh_results_to_file``; a file without
    any match is logged through ``nothing_was_found``.

    Args:
        number: The number to look for. It is matched literally.

    Returns:
        List of the IPv4 address strings found among the fields of the
        matching records (duplicates are kept).
    """
    found_ip = []
    # Escape the number so that regex metacharacters (e.g. a leading "+")
    # are matched literally instead of breaking or altering the pattern.
    # Compiling once also hoists the pattern out of the per-file loop.
    record_pattern = re.compile(f",,,,,.*{re.escape(number)}.*,")
    for file_name in os.listdir():
        if not file_name.startswith("MSK"):
            continue
        with open(file=file_name, mode='r') as f:
            text_of_file = f.read()
        results = record_pattern.findall(text_of_file)
        if not results:
            nothing_was_found(file_name, number)
            continue
        for element in results:
            write_searh_results_to_file(file_name, element)
            for candidate in improve_view_n(element).split():
                try:
                    ipaddress.IPv4Address(candidate)
                except ipaddress.AddressValueError:
                    # Not an IPv4 address: just another field of the record.
                    continue
                found_ip.append(candidate)
    return found_ip
def write_searh_results_to_file(file_where_match_was_found, element):
    """Append one matched record to ``found_results.txt``.

    The record is made readable with ``improve_view_n`` and prefixed with
    the name of the file in which it was found.
    """
    with open(file="found_results.txt", mode='a') as results_file:
        readable_record = improve_view_n(element)
        results_file.write(f"{file_where_match_was_found}: {readable_record} \n")
def nothing_was_found(file_where_match_wasnt_found, number_to_search):
    """Append a "no matches" note for a number/file pair to ``found_results.txt``."""
    with open(file="found_results.txt", mode='a') as results_file:
        note = f"NO MATCHES FOUND FOR {number_to_search} IN {file_where_match_wasnt_found}\n\n"
        results_file.write(note)
def check_if_ip_in_norma(ip, trunk_names):
    """Return the entries of *trunk_names* that mention the address *ip*.

    The address must appear as a whole: a plain substring test would report
    ``1.1.1.1`` inside ``11.1.1.10``. Entries that are not strings (for
    example NaN values for empty spreadsheet cells) are skipped instead of
    raising ``TypeError``.

    Args:
        ip: IPv4 address as a string.
        trunk_names: Iterable of entries to scan (e.g. a pandas column).

    Returns:
        List of matching entries, or a one-element list holding a
        "Norma does not contain information about ..." message when nothing
        matches.
    """
    # Reject matches directly preceded by a digit (or "digit.") or followed
    # by a digit (or ".digit"): those are pieces of a longer address.
    whole_ip = re.compile(rf"(?<!\d)(?<!\d\.){re.escape(ip)}(?!\.?\d)")
    line_which_contains_ip = [
        line for line in trunk_names
        if isinstance(line, str) and whole_ip.search(line)
    ]
    if not line_which_contains_ip:
        line_which_contains_ip.append(f"Norma does not contain information about {ip}")
    return line_which_contains_ip
def main():
    """Run the sequential search and report the results.

    Steps:
      1. Check that ``norma.xls`` exists and load it with pandas (first two
         rows skipped, no header); column 2 is used as the trunk list.
      2. Load the numbers with ``upload_number_list`` and collect the IPv4
         addresses returned by ``search_for_pattern`` for each of them.
      3. For every distinct address, look it up with
         ``check_if_ip_in_norma``, print the result and append it to
         ``found_results.txt``.
      4. Print the elapsed time in milliseconds.

    Raises:
        SystemExit: If ``norma.xls`` is missing.
    """
    t1 = int(round(time.time() * 1000))
    if not os.path.isfile("norma.xls"):
        print("norma.xls file was not found in the current directory")
        print("The program will be terminated")
        # Pause BEFORE exiting so the message can be read; the original
        # slept after sys.exit(), which was unreachable.
        time.sleep(3)
        sys.exit(1)
    normafile = pandas.read_excel('norma.xls', skiprows=2, header=None)
    trunk_names = normafile[2]

    found_ip_list = []
    for number in upload_number_list():
        found_ip_list.extend(search_for_pattern(number))

    unique_ips = set(found_ip_list)
    print(unique_ips)
    # Open the results file once rather than once per address.
    with open('found_results.txt', 'a') as f:
        for ip in unique_ips:
            x = check_if_ip_in_norma(ip, trunk_names)
            print(f"{x}\n")
            f.write(f"{x}\n")

    print("The program completed fine!")
    print("Take found_results.txt from the current folder")
    print("If you want to repeat search, remove found_results.txt")
    t2 = int(round(time.time() * 1000))
    print(f"Job is done within {t2 - t1} miliseconds")
    # NOTE(review): the long pause presumably keeps a console window open
    # so the output can be read - confirm before shortening.
    time.sleep(90)
    print("Bye!")
    time.sleep(1)
# Script entry point: run main() and keep any error message on screen for
# 20 seconds before the process ends.
if __name__ == '__main__':
    try:
        main()
    except Exception as error:
        print("The following error happened:", error, sep="\n")
        time.sleep(20)
Multithread:
import re
import os
import time
import sys
import pandas
import ipaddress
import threading
def improve_view_n(string_to_improve):
    """Convert a comma-separated record into space-separated text.

    Fields that are empty (consecutive commas) are discarded. Each remaining
    field is emitted followed by one space, so a non-empty result carries a
    trailing space.

    Args:
        string_to_improve: Comma-separated text.

    Returns:
        The non-empty fields, each followed by a single space.
    """
    kept_fields = [part for part in string_to_improve.split(",") if part != ""]
    # One join() instead of growing the string with "+=" in a loop.
    return "".join(part + " " for part in kept_fields)
def upload_number_list():
    """Read the numbers to search for from ``number_list.txt``.

    One number is expected per line. Surrounding whitespace (including
    ``\\r`` from Windows line endings) and backslashes are stripped, and
    lines that end up empty are skipped: an empty entry would otherwise be
    interpolated into the search regex and match every record.

    Returns:
        List of non-empty number strings, in file order.

    Raises:
        SystemExit: If ``number_list.txt`` does not exist (after printing a
            message and waiting 5 seconds).
    """
    numbers = []
    try:
        with open(file="number_list.txt", mode="r") as f:
            for raw_line in f:
                entry = raw_line.strip("\\ \t\r\n")
                if entry:
                    numbers.append(entry)
    except FileNotFoundError:
        print("number_list.txt file does not exist or corrupted.\n\n")
        print("The program will be terminated in 5 seconds")
        time.sleep(5)
        # Non-zero status so that callers/scripts can detect the failure.
        sys.exit(1)
    return numbers
def search_for_pattern(number, file_name, semaphore, found_ip):
    """Search one file for *number*; intended to run as a thread target.

    Files whose name does not start with ``MSK`` are ignored. Otherwise the
    file is read completely and scanned for records of the form
    ``,,,,,<anything><number><anything>,``. Matches are logged through
    ``write_searh_results_to_file``; a file without matches is logged
    through ``nothing_was_found``.

    Args:
        number: The number to look for. It is matched literally.
        file_name: Name of the file to scan.
        semaphore: Semaphore limiting how many files are processed at once.
        found_ip: Shared list; IPv4 address strings found in matching
            records are appended to it.

    Returns:
        None. Results are delivered through *found_ip*.
    """
    if not file_name.startswith("MSK"):
        return
    # "with" guarantees the permit is released even if reading or searching
    # raises; a bare acquire()/release() pair leaks it on any exception and
    # can eventually block every remaining worker.
    with semaphore:
        with open(file=file_name, mode='r') as f:
            text_of_file = f.read()
        # Escape the number so regex metacharacters (e.g. "+") match literally.
        results = re.findall(
            pattern=f",,,,,.*{re.escape(number)}.*,", string=text_of_file
        )
        if not results:
            nothing_was_found(file_name, number)
            return
        for element in results:
            write_searh_results_to_file(file_name, element)
            for candidate in improve_view_n(element).split():
                try:
                    ipaddress.IPv4Address(candidate)
                except ipaddress.AddressValueError:
                    # Not an IPv4 address: just another field of the record.
                    continue
                found_ip.append(candidate)
def write_searh_results_to_file(file_where_match_was_found, element):
    """Append a matched record, prefixed by its source file name, to ``found_results.txt``.

    The record text is passed through ``improve_view_n`` before writing.
    """
    with open(file="found_results.txt", mode='a') as out:
        cleaned = improve_view_n(element)
        out.write(f"{file_where_match_was_found}: {cleaned} \n")
def nothing_was_found(file_where_match_wasnt_found, number_to_search):
    """Record in ``found_results.txt`` that a number had no match in a file."""
    with open(file="found_results.txt", mode='a') as out:
        message = f"NO MATCHES FOUND FOR {number_to_search} IN {file_where_match_wasnt_found}\n\n"
        out.write(message)
def check_if_ip_in_norma(ip, trunk_names):
    """Return the entries of *trunk_names* that mention the address *ip*.

    The address must appear as a whole: a plain substring test would report
    ``1.1.1.1`` inside ``11.1.1.10``. Entries that are not strings (for
    example NaN values for empty spreadsheet cells) are skipped instead of
    raising ``TypeError``.

    Args:
        ip: IPv4 address as a string.
        trunk_names: Iterable of entries to scan (e.g. a pandas column).

    Returns:
        List of matching entries, or a one-element list holding a
        "Norma does not contain information about ..." message when nothing
        matches.
    """
    # Reject matches directly preceded by a digit (or "digit.") or followed
    # by a digit (or ".digit"): those are pieces of a longer address.
    whole_ip = re.compile(rf"(?<!\d)(?<!\d\.){re.escape(ip)}(?!\.?\d)")
    line_which_contains_ip = []
    for line in trunk_names:
        if isinstance(line, str) and whole_ip.search(line):
            line_which_contains_ip.append(line)
    if not line_which_contains_ip:
        line_which_contains_ip.append(f"Norma does not contain information about {ip}")
    return line_which_contains_ip
def main():
    """Run the threaded search and report the results.

    Steps:
      1. List the current directory, leaving out the program's own files.
      2. Check that ``norma.xls`` exists and load it with pandas (first two
         rows skipped, no header); column 2 is used as the trunk list.
      3. For every number from ``upload_number_list``, start one
         ``search_for_pattern`` thread per remaining file (a semaphore of
         10 limits concurrent file processing) and wait for all of them.
      4. For every distinct address collected, look it up with
         ``check_if_ip_in_norma``, print the result and append it to
         ``found_results.txt``.
      5. Print the elapsed time in milliseconds.

    Raises:
        SystemExit: If ``norma.xls`` is missing.
    """
    our_files = ('y.py', "found_results.txt", "number_list.txt", 'norma.xls', 'MultyThread.py')
    list_files = [name for name in os.listdir() if name not in our_files]
    semaphore = threading.Semaphore(10)
    t1 = int(round(time.time() * 1000))
    found_ip_list = []
    if not os.path.isfile("norma.xls"):
        print("norma.xls file was not found in the current directory")
        print("The program will be terminated")
        # Pause BEFORE exiting so the message can be read; the original
        # slept after sys.exit(), which was unreachable.
        time.sleep(3)
        sys.exit(1)
    normafile = pandas.read_excel('norma.xls', skiprows=2, header=None)
    trunk_names = normafile[2]
    numbers_to_search_list = upload_number_list()

    for number in numbers_to_search_list:
        # Build a fresh batch per number. The original kept one growing list
        # and indexed it with the file index, so from the second number on
        # it tried to restart already-started threads (RuntimeError) and
        # never started or joined the new ones.
        batch = [
            threading.Thread(
                target=search_for_pattern,
                args=(number, file_name, semaphore, found_ip_list),
            )
            for file_name in list_files
        ]
        for worker in batch:
            worker.start()
        for worker in batch:
            worker.join()

    unique_ips = set(found_ip_list)
    print(unique_ips)
    # All workers have been joined, so nothing else is writing to the
    # results file; open it once rather than once per address.
    with open('found_results.txt', 'a') as f:
        for ip in unique_ips:
            x = check_if_ip_in_norma(ip, trunk_names)
            print(f"{x}\n")
            f.write(f"{x}\n")

    print("The program completed fine!")
    print("Take found_results.txt from the current folder")
    print("If you want to repeat search, remove found_results.txt")
    t2 = int(round(time.time() * 1000))
    print(f"Job is done within {t2 - t1} miliseconds")
    # NOTE(review): the long pause presumably keeps a console window open
    # so the output can be read - confirm before shortening.
    time.sleep(90)
    print("Bye!")
    time.sleep(1)
# Entry point of the threaded script: run main(); on failure show the error
# and wait 20 seconds so that it can be read.
if __name__ == '__main__':
    try:
        main()
    except Exception as failure:
        print("The following error happened:", failure, sep="\n")
        time.sleep(20)
3 Réponses :
Python ne prend pas en charge le vrai multi-threading, vous avez toujours le Global Interpreter Lock [ en savoir plus sur GIL ] qui ne permet que l'exécution d'une seule instruction à la fois. Donc, il n'y a vraiment qu'un seul thread plus le code ajouté pour gérer les threads, donc il sera plus lent dans la plupart des cas.
Les opérations d'E/S peuvent être accélérées, mais pas toujours. Le module threading sert davantage à un autre style de programmation, par exemple la programmation asynchrone (pour laquelle Python dispose également d'un module : lien). Si vous souhaitez voir une réelle amélioration des performances, vous devez utiliser le module multiprocessing de Python, qui ne souffre pas du GIL ; en revanche, l'échange de données entre deux processus est plus compliqué qu'entre des threads.
Python utilise le Global Interpreter Lock (GIL). Cela fait essentiellement de l'ensemble du processus une seule application threadée.
Le multithreading Python n'est utile que si vous êtes lié aux E / S. Si vous souhaitez paralléliser votre charge de travail, vous devez utiliser le multiprocessing . Il a une API similaire au multithreading sauf que les processus ne partagent pas la mémoire entre eux.
Lorsqu'il s'agit de traiter des données, le multithreading en Python est plus lent, car Python n'exécute en réalité qu'un seul thread à la fois (à cause du GIL) et bascule entre les différents « threads » Python ; voir ce lien.
Il est plus lent à cause du temps de commutation entre les threads.
Vous devriez utiliser le multiprocessing.
Veuillez marquer une réponse comme correcte si elle a résolu votre problème.
@ mrangry777 J'ai écrit une version avec le module multiprocessing et elle fonctionne beaucoup plus lentement que les versions multithread et à thread unique ; de plus, le nouveau script ne fait pas correctement ce dont j'ai besoin (il ne renvoie pas correctement found_ip_list, mais une liste vide). Je vais essayer de résoudre le problème, puis je clôturerai la question si je n'ai plus d'interrogations. Merci pour le soutien.
Pouvez-vous poster le résultat de vos tests? Nombre d'itérations, taille des fichiers et temps d'exécution?
@ mrangry777 Oui, bien sûr, je vous fournirai les informations dès que j'aurai terminé les tests.