NLP Transformers model

Basic NLP model using Hugging Face's Transformers

Categories: fastpages, jupyter

Author: Eric Vincent
Published: September 26, 2022

This notebook was run on a Paperspace GPU instance.

# for working with paths in Python, I recommend using `pathlib.Path`
import os
from pathlib import Path

# on Kaggle the data and credentials are already set up; elsewhere we need a kaggle.json API token
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''  # paste the contents of your kaggle.json here

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

path = Path('us-patent-phrase-to-phrase-matching')
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f'{path}.zip').extractall(path)
Downloading us-patent-phrase-to-phrase-matching.zip to /notebooks
100%|██████████| 682k/682k [00:00<00:00, 27.8MB/s]
!ls {path}
sample_submission.csv  test.csv  train.csv
import pandas as pd
df = pd.read_csv(path/'train.csv')
# combine the context, target and anchor fields into a single input string for the model
df['input'] = 'TEXT1: ' + df.context + '; TEXT2: ' + df.target + '; ANC1: ' + df.anchor
df.input.head()
0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object
from datasets import Dataset,DatasetDict

# wrap the DataFrame in a Hugging Face Dataset
ds = Dataset.from_pandas(df)
ds
Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})
model_nm = 'microsoft/deberta-v3-small'
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
/usr/local/lib/python3.9/dist-packages/transformers/convert_slow_tokenizer.py:434: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
  warnings.warn(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
tokz.tokenize("Testing out some lil' odd words such as see ya later, hadn't seen him, Capitalize, ChAngeItUp")
['▁Testing',
 '▁out',
 '▁some',
 '▁lil',
 "'",
 '▁odd',
 '▁words',
 '▁such',
 '▁as',
 '▁see',
 '▁ya',
 '▁later',
 ',',
 '▁hadn',
 "'",
 't',
 '▁seen',
 '▁him',
 ',',
 '▁Capital',
 'ize',
 ',',
 '▁Ch',
 'A',
 'nge',
 'It',
 'Up']
def tok_func(x): return tokz(x["input"])
# tokenize every row; batched=True passes chunks of rows to tok_func at once
tok_ds = ds.map(tok_func, batched=True)
Parameter 'function'=<function tok_func at 0x7f84b0ecd820> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.
# input ids for the first row of data
row = tok_ds[0]
row['input'], row['input_ids']
('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])
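As a quick sanity check (not part of the run above), the ids can be decoded back into text with the same tokenizer; the special tokens added at the start and end show up in the decoded string.

# sanity check: decode the input ids back into text (special tokens included)
tokz.decode(row['input_ids'])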
# the Trainer expects the target column to be called 'labels'
tok_ds = tok_ds.rename_columns({'score':'labels'})
# load the competition test set (this is what we'll generate predictions for)
eval_df = pd.read_csv(path/'test.csv')

import numpy as np, matplotlib.pyplot as plt

# hold out 25% of the training data for validation
dds = tok_ds.train_test_split(0.25, seed=42)
dds
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})
# create the same input column for the test set and tokenize it
eval_df['input'] = 'TEXT1: ' + eval_df.context + '; TEXT2: ' + eval_df.target + '; ANC1: ' + eval_df.anchor
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)
# use numpy to compute the Pearson correlation coefficient
def corr(x,y): return np.corrcoef(x,y)[0][1]
# Plot function to visualize correlations
def show_corr(df, a, b):
    x,y = df[a],df[b]
    plt.scatter(x,y, alpha=0.5, s=4)
    plt.title(f'{a} vs {b}; r: {corr(x, y):.2f}')
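show_corr isn't called later in this section; as a purely illustrative example on synthetic data (not the competition data), it can be exercised like this:

# illustrative only: synthetic, correlated data to exercise show_corr
demo = pd.DataFrame({'x': np.random.rand(200)})
demo['y'] = demo.x + np.random.normal(0, 0.1, 200)
show_corr(demo, 'x', 'y')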
from transformers import TrainingArguments,Trainer
bs = 128
epochs = 4
lr = 8e-5
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')
# compute_metrics receives (predictions, labels); unpack them into the correlation helper
def corr_d(eval_pred): return {'pearson': corr(*eval_pred)}
# num_labels=1 gives a single regression output (the similarity score)
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz, compute_metrics=corr_d)
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using cuda_amp half precision backend
trainer.train();
The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
/usr/local/lib/python3.9/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 27354
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 856
[856/856 15:18, Epoch 4/4]

Epoch  Training Loss  Validation Loss  Pearson
1      No log         0.026263         0.798741
2      No log         0.025944         0.823264
3      0.034800       0.022987         0.833240
4      0.034800       0.021924         0.833892

The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256


Training completed. Do not forget to share your model on huggingface.co/models =)

# get predictions on the test set
preds = trainer.predict(eval_ds).predictions.astype(float)
preds
The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, id, input, target, context. If anchor, id, input, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 36
  Batch size = 256
[1/1 : < :]
array([[ 5.74218750e-01],
       [ 6.59179688e-01],
       [ 5.41992188e-01],
       [ 3.12255859e-01],
       [-3.18908691e-02],
       [ 5.43945312e-01],
       [ 5.07324219e-01],
       [ 7.92694092e-03],
       [ 2.51464844e-01],
       [ 1.04882812e+00],
       [ 3.00537109e-01],
       [ 2.63671875e-01],
       [ 7.12402344e-01],
       [ 8.55957031e-01],
       [ 7.36816406e-01],
       [ 4.27490234e-01],
       [ 2.95166016e-01],
       [-6.78062439e-04],
       [ 6.18164062e-01],
       [ 3.39843750e-01],
       [ 4.55566406e-01],
       [ 2.38769531e-01],
       [ 9.44213867e-02],
       [ 2.19604492e-01],
       [ 5.26855469e-01],
       [-2.81066895e-02],
       [-4.91638184e-02],
       [-2.97546387e-02],
       [-4.06188965e-02],
       [ 5.79589844e-01],
       [ 3.13232422e-01],
       [ 1.97219849e-03],
       [ 8.07617188e-01],
       [ 4.92431641e-01],
       [ 4.26513672e-01],
       [ 2.25585938e-01]])
# the regression head can produce values outside the valid 0-1 score range, so clip
preds = np.clip(preds, 0, 1)
preds
array([[0.57421875],
       [0.65917969],
       [0.54199219],
       [0.31225586],
       [0.        ],
       [0.54394531],
       [0.50732422],
       [0.00792694],
       [0.25146484],
       [1.        ],
       [0.30053711],
       [0.26367188],
       [0.71240234],
       [0.85595703],
       [0.73681641],
       [0.42749023],
       [0.29516602],
       [0.        ],
       [0.61816406],
       [0.33984375],
       [0.45556641],
       [0.23876953],
       [0.09442139],
       [0.21960449],
       [0.52685547],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.57958984],
       [0.31323242],
       [0.0019722 ],
       [0.80761719],
       [0.49243164],
       [0.42651367],
       [0.22558594]])
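To turn these clipped scores into a Kaggle submission (not shown in the run above), a minimal sketch pairs each test id with its score, assuming the standard id/score format from sample_submission.csv:

# minimal sketch: write the clipped predictions in the standard submission format
submission = pd.DataFrame({'id': eval_df.id, 'score': preds.squeeze()})
submission.to_csv('submission.csv', index=False)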