Skip to content

Commit 65e4130

Browse files
authored
Merge pull request #30 from ting-hsuan-chen/main
Update tegenomesimulator mode 0 demo
2 parents 6fd194c + dec8ae9 commit 65e4130

File tree

2 files changed

+56
-278
lines changed

2 files changed

+56
-278
lines changed

workshop/tegenomesimulator/1_tegenomesimulator_m0.ipynb

Lines changed: 34 additions & 278 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,13 @@
1818
},
1919
{
2020
"cell_type": "code",
21-
"execution_count": 1,
21+
"execution_count": null,
2222
"id": "d8d8aef7-39f5-400b-bf49-2b1856b64533",
2323
"metadata": {},
24-
"outputs": [
25-
{
26-
"name": "stdout",
27-
"output_type": "stream",
28-
"text": [
29-
"/powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator\n"
30-
]
31-
}
32-
],
24+
"outputs": [],
3325
"source": [
26+
"%%bash\n",
27+
"# assuming the current directory is \"calisim-examples-workshop-material/workshop/tegenomesimulator\"\n",
3428
"pwd"
3529
]
3630
},
@@ -50,6 +44,7 @@
5044
}
5145
],
5246
"source": [
47+
"%%bash\n",
5348
"cd data\n",
5449
"cat random_genome_chr_index.csv"
5550
]
@@ -80,6 +75,7 @@
8075
}
8176
],
8277
"source": [
78+
"%%bash\n",
8379
"grep \"^>\" combined_curated_TE_lib_ATOSZM_selected.fasta | head"
8480
]
8581
},
@@ -100,6 +96,7 @@
10096
}
10197
],
10298
"source": [
99+
"%%bash\n",
103100
"grep -c \"^>\" combined_curated_TE_lib_ATOSZM_selected.fasta"
104101
]
105102
},
@@ -113,136 +110,14 @@
113110
},
114111
{
115112
"cell_type": "code",
116-
"execution_count": 18,
113+
"execution_count": null,
117114
"id": "1b1a3113-3477-4d21-9b42-d69fce6f0965",
118115
"metadata": {
119116
"scrolled": true
120117
},
121-
"outputs": [
122-
{
123-
"name": "stdout",
124-
"output_type": "stream",
125-
"text": [
126-
"mkdir: cannot create directory ‘demo_m0’: File exists\n",
127-
"[2025-07-31 21:50:46.695228] TEgenomeSimulator started.\n",
128-
"[2025-07-31 21:50:46.695271] Arguments: {'mode': 0, 'to_mask': False, 'prefix': 'demo_m0_1_5', 'repeat': '../data/combined_curated_TE_lib_ATOSZM_selected.fasta', 'repeat2': None, 'maxcp': 5, 'mincp': 1, 'maxidn': 95, 'minidn': 80, 'maxsd': 20, 'minsd': 1, 'chridx': '../data/random_genome_chr_index.csv', 'genome': None, 'alpha': 0.5, 'beta': 0.7, 'intact': 1e-05, 'seed': 1, 'threads': 1, 'outdir': '.'}\n",
129-
"Output Directory: /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result\n",
130-
"Mode: 0\n",
131-
"Running Random Synthesized mode.\n",
132-
"Prefix: demo_m0_1_5\n",
133-
"Repeat: ../data/combined_curated_TE_lib_ATOSZM_selected.fasta\n",
134-
"Chromosome Index: ../data/random_genome_chr_index.csv\n",
135-
"Genome File: None\n",
136-
"Alpha: 0.5\n",
137-
"Beta: 0.7\n",
138-
"Max Copies: 5\n",
139-
"Min Copies: 1\n",
140-
"Upper bound of mean identity: 95\n",
141-
"Lower bound of mean identity: 80\n",
142-
"Upper bound of sd of mean identity: 20\n",
143-
"Lower bound of sd of mean ideneity: 1\n",
144-
"Max chance of intact insertion: 1e-05\n",
145-
"Seed: 1\n",
146-
"\n",
147-
"\n",
148-
"#########################################################\n",
149-
"### Prepare TE library table with simulation settings ###\n",
150-
"#########################################################\n",
151-
"Using repeat fasta file ../data/combined_curated_TE_lib_ATOSZM_selected.fasta\n",
152-
"Output directory set as /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result\n",
153-
"\n",
154-
"\n",
155-
"## Random chosing copy numbe for each TE family ##\n",
156-
"Maximum copy number set by user: 5\n",
157-
"Minimum copy number set by user: 5\n",
158-
"\n",
159-
"\n",
160-
"## Random chosing the averaged sequence identity for each TE family ##\n",
161-
"Maximum averaged sequence identity: 95\n",
162-
"Minimum averaged sequence identity: 80\n",
163-
"\n",
164-
"\n",
165-
"## Random chosing the standard deviation of averaged sequence identity for each TE family ##\n",
166-
"Maximum standard deviation of averaged sequence identity: 20\n",
167-
"Minimum standard deviation of averaged sequence identity: 1\n",
168-
"\n",
169-
"\n",
170-
"## Random chosing the proportion of INDEL to total SNP (dependant on sequence identity) for each TE family ##\n",
171-
"Maximum INDEL proportion set by default: 20\n",
172-
"Minimum INDEL proportion set by default: 5\n",
173-
"\n",
174-
"\n",
175-
"## Setting the length of std based on prior knowledge ##\n",
176-
"Length range of TSD for LTR retrotransposon set to: 5 - 5\n",
177-
"Length range of TSD for LINE set to: 5 - 20\n",
178-
"Length range of TSD for SINE set to: 5 - 20\n",
179-
"Length range of TSD for DTA set to: 5 - 8\n",
180-
"Length range of TSD for DTC set to: 2 - 4\n",
181-
"Length range of TSD for DTH set to: 3 - 3\n",
182-
"Length range of TSD for DTM set to: 8 - 9\n",
183-
"Length range of TSD for DTT set to: 2 - 2\n",
184-
"Length range of TSD for Helitron set to: 0\n",
185-
"Length range of TSD for MITE set to: 2 - 10\n",
186-
"Length range of TSD for else set to: 0\n",
187-
"\n",
188-
"\n",
189-
"## Extracting the length of each TE family ##\n",
190-
"\n",
191-
"\n",
192-
"## Setting the proportion of fragmented TE loci of each TE family ##\n",
193-
"Maximum chance of keeping a TE insertion intact as 100% integrity in each TE family: 1e-05\n",
194-
"Minimum chance of keeping a TE insertion intact in each TE family: 0\n",
195-
"Maximum proportion of fragmented TE loci of each TE family: 100\n",
196-
"Minimum proportion of fragmented TE loci of each TE family: 99.99999\n",
197-
"\n",
198-
"\n",
199-
"## Setting the proportion of nested TE insertion of each Copia or Gypsy family ##\n",
200-
"Maximum proportion of nested TE insertion of each Copia or Gypsy family set by default: 30\n",
201-
"Minimum proportion of nested TE insertion of each Copia or Gypsy family set by default: 0\n",
202-
"\n",
203-
"\n",
204-
"## Printing the TE library table ##\n",
205-
"Generated the TE library table for simulation. File saved as /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result/TElib_sim_list.table\n",
206-
"\n",
207-
"\n",
208-
"\n",
209-
"TE library table generated successfully. Output logged to /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result/TEgenomeSimulator.log\n",
210-
"mode=0, running prep_yml_config.py for Random Genome Mode.\n",
211-
"\n",
212-
"\n",
213-
"#############################################\n",
214-
"### Prepare TEgenomeSimulator config file ###\n",
215-
"#############################################\n",
216-
"Using genome fasta file None\n",
217-
"Using repeat fasta file ../data/combined_curated_TE_lib_ATOSZM_selected.fasta\n",
218-
"Output directory set as /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result\n",
219-
"Generated the config file for simulation. File saved as /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result/TEgenomeSimulator_demo_m0_1_5.yml\n",
220-
"\n",
221-
"\n",
222-
"\n",
223-
"Config file generated successfully. Output logged to /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result/TEgenomeSimulator.log\n",
224-
"\n",
225-
"\n",
226-
"##############################################################\n",
227-
"### Mutate TE sequence and perform non-overlap TE insertion###\n",
228-
"##############################################################\n",
229-
"Using mode 0 (0 for random genome; 1 for custome genome)\n",
230-
"Using config file TEgenomeSimulator_demo_m0_1_5.yml\n",
231-
"\n",
232-
"Genome with non-overlap random TE insertions was generated successfully. Output logged to /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result/TEgenomeSimulator.log\n",
233-
"\n",
234-
"\n",
235-
"##############################################################\n",
236-
"### Mutate TE sequence and perform non-overlap TE insertion###\n",
237-
"##############################################################\n",
238-
"Using mode 0 (0 for random genome; 1 for custome genome)\n",
239-
"Using config file TEgenomeSimulator_demo_m0_1_5.yml\n",
240-
"\n",
241-
"Genome with non-overlap random and nested TE insertions was generated successfully. Output logged to /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/TEgenomeSimulator_demo_m0_1_5_result/TEgenomeSimulator.log\n"
242-
]
243-
}
244-
],
118+
"outputs": [],
245119
"source": [
120+
"%%bash\n",
246121
"cd ..\n",
247122
"mkdir demo_m0\n",
248123
"cd demo_m0\n",
@@ -254,8 +129,7 @@
254129
"outdir=\".\"\n",
255130
"prefix=demo_m0_${mincn}_${maxcn}\n",
256131
"\n",
257-
"#/home/cflthc/.local/bin/tegenomesimulator -M 0 -p $prefix -c $chridx -r $repeat -m $maxcn -n $mincn -i $intact -o $outdir\n",
258-
"python3 /workspace/cflthc/script/TEgenomeSimulator/TEgenomeSimulator/TEgenomeSimulator.py -M 0 -p $prefix -c $chridx -r $repeat -m $maxcn -n $mincn -i $intact -o $outdir"
132+
"tegenomesimulator -M 0 -p $prefix -c $chridx -r $repeat -m $maxcn -n $mincn -i $intact -o $outdir"
259133
]
260134
},
261135
{
@@ -268,50 +142,25 @@
268142
},
269143
{
270144
"cell_type": "code",
271-
"execution_count": 14,
145+
"execution_count": null,
272146
"id": "0558d829-9e2b-44fc-81c6-7c153c302afa",
273-
"metadata": {},
274-
"outputs": [
275-
{
276-
"name": "stdout",
277-
"output_type": "stream",
278-
"text": [
279-
"total 64\n",
280-
"drwxr-sr-x. 3 cflthc powerplant 384 Jul 31 21:05 report\n",
281-
"drwxr-sr-x. 2 cflthc powerplant 606 Jul 31 19:34 TEgenomeSimulator_demo_m0_1_5_result\n"
282-
]
283-
}
284-
],
147+
"metadata": {
148+
"scrolled": true
149+
},
150+
"outputs": [],
285151
"source": [
152+
"%%bash\n",
286153
"ls -l"
287154
]
288155
},
289156
{
290157
"cell_type": "code",
291-
"execution_count": 15,
158+
"execution_count": null,
292159
"id": "6856354c-c45a-4595-ba63-cbed123cfc3b",
293160
"metadata": {},
294-
"outputs": [
295-
{
296-
"name": "stdout",
297-
"output_type": "stream",
298-
"text": [
299-
"total 28720\n",
300-
"-rw-r--r--. 1 cflthc powerplant 5683983 Jul 31 21:26 demo_m0_1_5_genome_sequence_out.fasta\n",
301-
"-rw-r--r--. 1 cflthc powerplant 5897899 Jul 31 21:26 demo_m0_1_5_genome_sequence_out_final.fasta\n",
302-
"-rw-r--r--. 1 cflthc powerplant 68 Jul 31 21:05 demo_m0_1_5_genome_sequence_out_final.fasta.fai\n",
303-
"-rw-r--r--. 1 cflthc powerplant 590397 Jul 31 21:26 demo_m0_1_5_repeat_annotation_out_final.gff\n",
304-
"-rw-r--r--. 1 cflthc powerplant 506440 Jul 31 21:26 demo_m0_1_5_repeat_annotation_out.gff\n",
305-
"-rw-r--r--. 1 cflthc powerplant 5685938 Jul 31 21:26 demo_m0_1_5_repeat_sequence_out.fasta\n",
306-
"-rw-r--r--. 1 cflthc powerplant 5930191 Jul 31 21:26 demo_m0_1_5_repeat_sequence_out_final.fasta\n",
307-
"-rw-r--r--. 1 cflthc powerplant 220183 Jul 31 21:05 demo_m0_1_5_repeat_sequence_out_final.fasta.fai\n",
308-
"-rw-r--r--. 1 cflthc powerplant 417 Jul 31 21:26 TEgenomeSimulator_demo_m0_1_5.yml\n",
309-
"-rw-r--r--. 1 cflthc powerplant 5461 Jul 31 21:26 TEgenomeSimulator.log\n",
310-
"-rw-r--r--. 1 cflthc powerplant 130556 Jul 31 21:26 TElib_sim_list.table\n"
311-
]
312-
}
313-
],
161+
"outputs": [],
314162
"source": [
163+
"%%bash\n",
315164
"ls -l TEgenomeSimulator_demo_m0_1_5_result"
316165
]
317166
},
@@ -391,120 +240,27 @@
391240
}
392241
],
393242
"source": [
394-
"/home/cflthc/.local/bin/tegenomesimulator --help"
395-
]
396-
},
397-
{
398-
"cell_type": "markdown",
399-
"id": "683fd8d0-7389-46c0-9576-308a5dd4eeb5",
400-
"metadata": {},
401-
"source": [
402-
"## Visualisation\n",
403-
"Rmd file with input parameters"
404-
]
405-
},
406-
{
407-
"cell_type": "code",
408-
"execution_count": 9,
409-
"id": "b879f845-e035-40da-aba6-1c503f8aeaa1",
410-
"metadata": {},
411-
"outputs": [
412-
{
413-
"name": "stdout",
414-
"output_type": "stream",
415-
"text": [
416-
"Loading \u001b[1mR/4.3.3\u001b[22m\n",
417-
" \u001b[94mLoading requirement\u001b[0m: unixODBC/2.3.12 JAGS/4.3.2 texlive/20230914\n"
418-
]
419-
}
420-
],
421-
"source": [
422-
"module load samtools\n",
423-
"module load R"
424-
]
425-
},
426-
{
427-
"cell_type": "code",
428-
"execution_count": 10,
429-
"id": "bd0be183-6002-489a-a020-38ac74052fff",
430-
"metadata": {},
431-
"outputs": [
432-
{
433-
"name": "stdout",
434-
"output_type": "stream",
435-
"text": [
436-
"/powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0\n"
437-
]
438-
}
439-
],
440-
"source": [
441-
"pwd"
442-
]
443-
},
444-
{
445-
"cell_type": "code",
446-
"execution_count": 19,
447-
"id": "1704c7c4-5a2b-45c3-ab67-cde40bdcfb8e",
448-
"metadata": {},
449-
"outputs": [],
450-
"source": [
451-
"# need to use full path of input files and directory to create the report\n",
452-
"demo_dir=$(pwd)\n",
453-
"genome_fa=$demo_dir/TEgenomeSimulator_demo_m0_1_5_result/demo_m0_1_5_genome_sequence_out_final.fasta\n",
454-
"repeat_fa=$demo_dir/TEgenomeSimulator_demo_m0_1_5_result/demo_m0_1_5_repeat_sequence_out_final.fasta\n",
455-
"repeat_gff=$demo_dir/TEgenomeSimulator_demo_m0_1_5_result/demo_m0_1_5_repeat_annotation_out_final.gff\n",
456-
"prefix=\"demo_m0\"\n",
457-
"outdir=$demo_dir/report\n",
458-
"mkdir -p $outdir"
459-
]
460-
},
461-
{
462-
"cell_type": "code",
463-
"execution_count": 20,
464-
"id": "feefc865-9c15-4655-951b-3c82cd85c6a3",
465-
"metadata": {},
466-
"outputs": [
467-
{
468-
"name": "stdout",
469-
"output_type": "stream",
470-
"text": [
471-
"\n",
472-
"\n",
473-
"processing file: summarise_demo_m0.Rmd\n",
474-
" \n",
475-
"output file: summarise_demo_m0.knit.md\n",
476-
"\n",
477-
"/software/pandoc/3.2/bin/pandoc +RTS -K512m -RTS summarise_demo_m0.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /powerplant/workspace/cflthc/script/calisim-examples-workshop-material/workshop/tegenomesimulator/demo_m0/report/tegenomesimulator_report_demo_m0.html --lua-filter /software/statistical/R-4.3.3/lib64/R/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /software/statistical/R-4.3.3/lib64/R/library/rmarkdown/rmarkdown/lua/latex-div.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 2 --template /software/statistical/R-4.3.3/lib64/R/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=bootstrap --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /tmp/Rtmpk98MB0/rmarkdown-str17a2f4450fdf62.html \n",
478-
"\n",
479-
"Output created: report/tegenomesimulator_report_demo_m0.html\n",
480-
"\u001b[?25h\u001b[?25h\n"
481-
]
482-
}
483-
],
484-
"source": [
485-
"Rscript ../run_tegs_report.R ${genome_fa} ${repeat_fa} ${repeat_gff} ${prefix} ${outdir}"
243+
"tegenomesimulator --help"
486244
]
487-
},
488-
{
489-
"cell_type": "code",
490-
"execution_count": null,
491-
"id": "635dc66a-0753-4938-9ff6-811ed567a26f",
492-
"metadata": {},
493-
"outputs": [],
494-
"source": []
495245
}
496246
],
497247
"metadata": {
498248
"kernelspec": {
499-
"display_name": "Bash",
500-
"language": "bash",
501-
"name": "bash"
249+
"display_name": "Python 3 (ipykernel)",
250+
"language": "python",
251+
"name": "python3"
502252
},
503253
"language_info": {
504-
"codemirror_mode": "shell",
505-
"file_extension": ".sh",
506-
"mimetype": "text/x-sh",
507-
"name": "bash"
254+
"codemirror_mode": {
255+
"name": "ipython",
256+
"version": 3
257+
},
258+
"file_extension": ".py",
259+
"mimetype": "text/x-python",
260+
"name": "python",
261+
"nbconvert_exporter": "python",
262+
"pygments_lexer": "ipython3",
263+
"version": "3.9.20"
508264
}
509265
},
510266
"nbformat": 4,

0 commit comments

Comments
 (0)