
Python 中通常使用 lance 句柄来解析数据，如下所示：

```python
import lance

lance.dataset()
```

但假如一个数据中存在子数据块，而每个子数据块都是 lance 数据格式，文件层级如下：

```
data
 |
 |
 |_ _ _data1(lance格式)
 |
 |
 |_ _ _data2(lance格式)
 |
 |
 |_ _ _data3(lance格式)
```

那么在解析时，为提速会使用多进程进行解析。

问题一：若遇到多进程解析卡死问题，需局部拉起 spawn 上下文。

```python
import os
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed


def parser_data(data_path):
    info = lance.dataset()
    return info


sub_data_dir = [os.path.join(data, f) for f in os.listdir(data_dir)]

# 若遇到多进程解析卡死问题，需局部拉起spawn上下文
mp_text = multiprocessing.get_context("spawn")
with ProcessPoolExecutor(max_workers=os.cpu_count(), mp_context=mp_text) as executor:
    futures = [executor.submit(parser_data, path) for path in sub_data_dir]
    for future in as_completed(futures):
        result = future.result()
        print(result)
```

问题二：若主进程调用过 lance 句柄，子进程中则不能调用，否则也会存在卡住问题，需要将主进程调用处更改为独立进程进行封装。

```python
import os
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed


def parser_data(data_path):
    info = lance.dataset()
    return info


sub_data_dir = [os.path.join(data, f) for f in os.listdir(data_dir)]

# 此处相当于在多进程解析之前就调用过lance句柄，会有卡住现象存在
# 需更改为独立进程运行，以下给出粗略方法解决
# parser_data(data_path_0)
with ProcessPoolExecutor(max_workers=1) as executor:
    futures = executor.submit(parser_data, data_path_0)

# 若遇到多进程解析卡死问题，需局部拉起spawn上下文
mp_text = multiprocessing.get_context("spawn")
with ProcessPoolExecutor(max_workers=os.cpu_count(), mp_context=mp_text) as executor:
    futures = [executor.submit(parser_data, path) for path in sub_data_dir]
    for future in as_completed(futures):
        result = future.result()
        print(result)
```