Машинное обучение Azure с использованием spacy NLP

Создайте сквозной конвейер с помощью машинного обучения Azure для обучения и оценки

Предварительные требования

Учетная запись Azure
Хранилище Azure
Служба машинного обучения Azure

Код

В этом примере кода показано, как создать и запустить конвейер обучения и вывода AML с помощью SDK.
Это не реальная реализация.
Код обучения и логического вывода является образцом и не готов к использованию в рабочей среде.
Контекст готов к использованию в производстве
Этот код протестирован на Python 3.8 с ядром Azure ML.
Давайте настроим рабочую область для запуска.
В приведенном ниже коде предполагается, что входные данные заданы ADLS gen2 с набором данных, созданным для ввода и вывода.
Когда у нас есть ввод и вывод в хранилище, любое потребляющее приложение может получить результаты.
Приведенный ниже код предназначен только для пакетной обработки.

import azureml.core
from azureml.core import Workspace, Datastore
# Connect to the Azure ML workspace described by the saved configuration.
# `ws` is the workspace handle that every later snippet relies on.
ws = Workspace.from_config()

Далее следует настроить хранилище данных рабочей области по умолчанию.

# Datastore the workspace uses by default.
def_data_store = ws.get_default_datastore()

# Look up the workspace's built-in blob and file-share datastores by name.
def_blob_store = Datastore(workspace=ws, name="workspaceblobstore")
def_file_store = Datastore(workspace=ws, name="workspacefilestore")

Далее создайте вычислительный кластер

from azureml.core.compute import ComputeTarget, AmlCompute

# Name and VM size of the cluster that runs the pipeline steps.
# NOTE(review): STANDARD_NC6 is a GPU-enabled size although the cluster is
# named "cpu-cluster" -- confirm which one is intended before provisioning.
compute_name = "cpu-cluster"
vm_size = "STANDARD_NC6"

if compute_name in ws.compute_targets:
    # A target with this name is already registered in the workspace: reuse it.
    compute_target = ws.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('Found compute target: ' + compute_name)
    else:
        # Previously this case passed silently; surface it so a name clash
        # with a different kind of compute is noticed before submission.
        print('Compute target ' + compute_name + ' exists but is not an AmlCompute cluster: '
              + type(compute_target).__name__)
else:
    print('Creating a new compute target...')
    # Autoscaling cluster: scales down to zero nodes when idle, up to four.
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=0,
                                                                max_nodes=4)
    # Create the compute target in the workspace.
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)
    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster.
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)
    # For a more detailed view of current cluster status, use the 'status' property.
    print(compute_target.status.serialize())

Мы используем только кластер ЦП. При необходимости можно выбрать вариант с графическим процессором.
Импортируйте библиотеки AML

from azureml.core import Dataset
from azureml.data.dataset_factory import DataType
from azureml.pipeline.steps import PythonScriptStep
from azureml.data import OutputFileDatasetConfig
from azureml.core import Workspace, Datastore
# Handle to the workspace's default datastore (`ws` comes from the workspace snippet).
datastore = ws.get_default_datastore()

Создание конфигурации среды для вычислений

from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment

# Run configuration shared by the pipeline steps. It targets the
# `compute_target` prepared in the compute-cluster snippet.
aml_run_config = RunConfiguration()
aml_run_config.target = compute_target

# Switch between a curated Azure ML environment and a custom conda environment.
USE_CURATED_ENV = True
if not USE_CURATED_ENV:
    # Custom environment: let Azure ML manage the dependencies and declare
    # the conda/pip packages the scripts rely on.
    python_section = aml_run_config.environment.python
    python_section.user_managed_dependencies = False
    python_section.conda_dependencies = CondaDependencies.create(
        conda_packages=['pandas', 'scikit-learn', 'seaborn', 'tqdm'],
        pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]', 'seaborn', 'tqdm', 'spacy'],
        pin_sdk_version=False)
else:
    # Curated environment fetched from the workspace by name.
    curated_environment = Environment.get(workspace=ws, name="AzureML-Tutorial")
    aml_run_config.environment = curated_environment

Теперь давайте напишем код train.py
Создайте новый файл как текстовый файл и переименуйте его в train.py.

import sys
import subprocess

# Install spaCy and its small English model with the interpreter that is
# running this script; both calls raise if the command fails.
install_spacy_cmd = [sys.executable, '-m', 'pip', 'install', 'spacy']
download_model_cmd = [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm']
subprocess.check_call(install_spacy_cmd)
subprocess.check_call(download_model_cmd)

# Imported only after the install above has made the package available.
import spacy

nlp = spacy.load('en_core_web_sm')

# Sample text to split into sentences.
about_text = ('Gus Proto is a Python developer currently'
              ' working for a London-based Fintech'
              ' company. He is interested in learning'
              ' Natural Language Processing.')
about_doc = nlp(about_text)

# Print every sentence spaCy detected, one per line.
sentences = list(about_doc.sents)
for sentence in sentences:
    print(sentence)

Приведённый выше код должен находиться в папке train.
Если такой папки нет, создайте её.
Вышеприведенный код не делает ничего, кроме вывода предложений.
Вышеприведенный код можно отредактировать в соответствии с вашим сценарием.
Также можно добавить код для добавления информации о запуске.
Далее мы настраиваем конвейер обучения

# Folder that holds the training script, and the script's file name inside it.
train_source_dir = "./train"
train_entry_point = "train.py"

# Pipeline step that runs the training script on the cluster with the shared
# run configuration. No script arguments are passed, and step reuse is
# switched off via allow_reuse=False.
train_step = PythonScriptStep(
    train_entry_point,
    source_directory=train_source_dir,
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=False,
)

Создание шагов конвейера

from azureml.pipeline.core import Pipeline

# Steps that make up the training pipeline (a single step in this sample).
compare_models = [train_step]

# Build the pipeline from the step list. The list was previously created but
# never used and a bare step object was passed instead; `steps` is documented
# as a list of steps, so hand over the list.
pipeline1 = Pipeline(workspace=ws, steps=compare_models)

Проверка конвейера

# Validate the pipeline built above before submitting it.
pipeline1.validate()
print("Pipeline validation complete")

Запуск конвейера обучения

from azureml.core import Experiment

# Experiment under which the training pipeline runs are recorded.
training_experiment = Experiment(ws, 'Spacy_Pipeline_Notebook')

# Submit the pipeline and block until the run has finished.
pipeline_run1 = training_experiment.submit(pipeline1)
pipeline_run1.wait_for_completion()

Отображение результатов

from azureml.widgets import RunDetails
# Show the run-details widget for the submitted pipeline run.
RunDetails(pipeline_run1).show()

Затем создайте конвейер для переобучения с помощью Synapse Integration или фабрики данных Azure.

from azureml.pipeline.core.graph import PipelineParameter

# Example pipeline parameter named "pipeline_arg" with a default value of 10.
pipeline_param = PipelineParameter(name="pipeline_arg", default_value=10)

Опубликуйте конвейер

# Publish the pipeline behind the completed run so that it can be invoked
# later through its endpoint.
published_pipeline1 = pipeline_run1.publish_pipeline(
    name="Published_Spacy_Pipeline_Notebook",
    description="Spacy_Pipeline_Notebook Published Pipeline Description",
    version="1.0",
)

Параметры конвейера не используются в приведенном выше обучающем скрипте; они приведены только для того, чтобы показать, как их передавать.
Чтобы выполнить указанный выше конвейер, сначала нужно получить разрешение.
Настройка субъекта-службы Azure
Предоставьте участникам доступ к рабочей области AML для запуска конвейера.

from azureml.core.authentication import TokenAuthentication, Audience
# This is a sample method to retrieve token and will be passed to TokenAuthentication
def get_token_for_audience(audience):
    """Acquire an Azure AD access token for *audience* as a service principal.

    Sample callback meant to be passed to ``TokenAuthentication``.
    Credentials are read from the ``AZURE_CLIENT_ID``, ``AZURE_CLIENT_SECRET``
    and ``AZURE_TENANT_ID`` environment variables so that real secrets never
    need to be written into the source. The original placeholder values are
    used when a variable is not set.

    Args:
        audience: Resource the token is requested for.

    Returns:
        The ``accessToken`` entry of the ADAL response.
    """
    import os
    from adal import AuthenticationContext

    client_id = os.environ.get("AZURE_CLIENT_ID", "clientid")
    client_secret = os.environ.get("AZURE_CLIENT_SECRET", "xxxxxxxxxxxxxxxx")
    tenant_id = os.environ.get("AZURE_TENANT_ID", "tenantid")

    # The authority URL is tenant-specific.
    auth_context = AuthenticationContext("https://login.microsoftonline.com/{}".format(tenant_id))
    resp = auth_context.acquire_token_with_client_credentials(audience, client_id, client_secret)
    return resp["accessToken"]

# Authentication object that obtains tokens through the callback defined above.
token_auth = TokenAuthentication(get_token_for_audience=get_token_for_audience)

Создайте заголовок авторизации

# `aad_token` was never defined in this walkthrough, so building the header by
# hand raised NameError. Ask the TokenAuthentication object for the header
# instead; it has the form {"Authorization": "Bearer <token>"}.
headerInfo = token_auth.get_authentication_header()

Теперь вызовите опубликованный конвейер

from azureml.pipeline.core import PublishedPipeline
import requests

# Trigger the published pipeline through its REST endpoint. The JSON body
# names the experiment to run under and overrides the "pipeline_arg" parameter.
# A timeout keeps the call from hanging indefinitely on a stalled connection.
response = requests.post(published_pipeline1.endpoint,
                         headers=headerInfo,
                         json={"ExperimentName": "Published_Spacy_Pipeline_Notebook",
                               "ParameterAssignments": {"pipeline_arg": 20}},
                         timeout=60)
# Fail loudly on an HTTP error (for example 401/403 when the service
# principal lacks access) instead of silently ignoring the response.
response.raise_for_status()

Теперь перейдите на страницу эксперимента и дождитесь завершения эксперимента.
Как только эксперимент будет завершен, мы можем попробовать конвейер оценки.
Сначала создайте файл score.py

import sys
import subprocess

# Make sure spaCy and the small English model are installed for the running
# interpreter; check_call raises if either command fails.
for command in (
    [sys.executable, '-m', 'pip', 'install', 'spacy'],
    [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
):
    subprocess.check_call(command)

# Imported only after the install above has made the package available.
import spacy

nlp = spacy.load('en_core_web_sm')

# Sample text to split into sentences.
about_text = ('Gus Proto is a Python developer currently'
              ' working for a London-based Fintech'
              ' company. He is interested in learning'
              ' Natural Language Processing.')
about_doc = nlp(about_text)

# Print each detected sentence on its own line.
sentences = list(about_doc.sents)
for sentence in sentences:
    print(sentence)

Создайте конвейер вывода

# Folder and file name of the scoring script. The variable names are carried
# over from the training snippet and now point at the inference code.
train_source_dir = "./inference"
train_entry_point = "score.py"

# Pipeline step that runs the scoring script on the cluster with the shared
# run configuration. No script arguments are passed, and step reuse is
# switched off via allow_reuse=False.
train_step = PythonScriptStep(
    train_entry_point,
    source_directory=train_source_dir,
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=False,
)

Создание шагов конвейера

from azureml.pipeline.core import Pipeline

# Steps that make up the inference pipeline (a single step in this sample).
compare_models = [train_step]

# Build the pipeline from the step list. The list was previously created but
# never used and a bare step object was passed instead; `steps` is documented
# as a list of steps, so hand over the list.
pipeline1 = Pipeline(workspace=ws, steps=compare_models)

Проверка конвейера

# Validate the inference pipeline before submitting it.
pipeline1.validate()
print("Pipeline validation complete")

Отправьте запуск конвейера.
Перейти к эксперименту и дождаться завершения конвейера

from azureml.core import Experiment

# Experiment under which the inference pipeline runs are recorded.
inference_experiment = Experiment(ws, 'Spacy_Pipeline_Inferencing')

# Submit the pipeline and block until the run has finished.
pipeline_run1 = inference_experiment.submit(pipeline1)
pipeline_run1.wait_for_completion()

Как только эксперимент увенчается успехом, значит, все в порядке.

Оригинал статьи — Samples2022/spacyetoe.md на главной · balakreshnan/Samples2022 (github.com)

Предложения по подаче заявок на Mlearning.ai
Как стать автором на Mlearning.ai (medium.com)

Машинное обучение Azure с использованием spacy NLP

Создайте сквозной конвейер с помощью машинного обучения Azure для обучения и оценки

Предпосылки

Код

Вопросы по теме