mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	Initial commit
This commit is contained in:
		
						commit
						c52d42c2b6
					
				
					 24 changed files with 1350 additions and 0 deletions
				
			
		
							
								
								
									
										4
									
								
								.Rbuildignore
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								.Rbuildignore
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,4 @@ | |||
| ^geposan\.Rproj$ | ||||
| ^\.Rproj\.user$ | ||||
| ^LICENSE\.md$ | ||||
| ^scripts$ | ||||
							
								
								
									
										1
									
								
								.gitignore
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
										
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| .Rproj.user | ||||
							
								
								
									
										30
									
								
								DESCRIPTION
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								DESCRIPTION
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,30 @@ | |||
| Package: geposan | ||||
| Title: Gene Position Analysis | ||||
| Version: 0.0.0.9000 | ||||
| Authors@R: | ||||
|     person( | ||||
|         "Elias", | ||||
|         "Projahn", | ||||
|         email = "elias@johrpan.de", | ||||
|         role = c("aut", "cre"), | ||||
|     ) | ||||
| Description: Analyze genes based on their position across species. This package | ||||
|     includes gene data from Ensembl. It provides multiple methods to use that | ||||
|     data to find genes that score well in comparison with a set of reference | ||||
|     genes. | ||||
| License: GPL (>= 3) | ||||
| Encoding: UTF-8 | ||||
| LazyData: true | ||||
| LazyDataCompression: xz | ||||
| Roxygen: list(markdown = TRUE) | ||||
| RoxygenNote: 7.1.2 | ||||
| Depends: | ||||
|     R (>= 2.10) | ||||
| Imports: | ||||
|     data.table, | ||||
|     neuralnet | ||||
| Suggests: | ||||
|     biomaRt, | ||||
|     rlog, | ||||
|     stringr, | ||||
|     usethis | ||||
							
								
								
									
										595
									
								
								LICENSE.md
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										595
									
								
								LICENSE.md
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,595 @@ | |||
| GNU General Public License | ||||
| ========================== | ||||
| 
 | ||||
| _Version 3, 29 June 2007_   | ||||
| _Copyright © 2007 Free Software Foundation, Inc. <<http://fsf.org/>>_ | ||||
| 
 | ||||
| Everyone is permitted to copy and distribute verbatim copies of this license | ||||
| document, but changing it is not allowed. | ||||
| 
 | ||||
| ## Preamble | ||||
| 
 | ||||
| The GNU General Public License is a free, copyleft license for software and other | ||||
| kinds of works. | ||||
| 
 | ||||
| The licenses for most software and other practical works are designed to take away | ||||
| your freedom to share and change the works. By contrast, the GNU General Public | ||||
| License is intended to guarantee your freedom to share and change all versions of a | ||||
| program--to make sure it remains free software for all its users. We, the Free | ||||
| Software Foundation, use the GNU General Public License for most of our software; it | ||||
| applies also to any other work released this way by its authors. You can apply it to | ||||
| your programs, too. | ||||
| 
 | ||||
| When we speak of free software, we are referring to freedom, not price. Our General | ||||
| Public Licenses are designed to make sure that you have the freedom to distribute | ||||
| copies of free software (and charge for them if you wish), that you receive source | ||||
| code or can get it if you want it, that you can change the software or use pieces of | ||||
| it in new free programs, and that you know you can do these things. | ||||
| 
 | ||||
| To protect your rights, we need to prevent others from denying you these rights or | ||||
| asking you to surrender the rights. Therefore, you have certain responsibilities if | ||||
| you distribute copies of the software, or if you modify it: responsibilities to | ||||
| respect the freedom of others. | ||||
| 
 | ||||
| For example, if you distribute copies of such a program, whether gratis or for a fee, | ||||
| you must pass on to the recipients the same freedoms that you received. You must make | ||||
| sure that they, too, receive or can get the source code. And you must show them these | ||||
| terms so they know their rights. | ||||
| 
 | ||||
| Developers that use the GNU GPL protect your rights with two steps: **(1)** assert | ||||
| copyright on the software, and **(2)** offer you this License giving you legal permission | ||||
| to copy, distribute and/or modify it. | ||||
| 
 | ||||
| For the developers' and authors' protection, the GPL clearly explains that there is | ||||
| no warranty for this free software. For both users' and authors' sake, the GPL | ||||
| requires that modified versions be marked as changed, so that their problems will not | ||||
| be attributed erroneously to authors of previous versions. | ||||
| 
 | ||||
| Some devices are designed to deny users access to install or run modified versions of | ||||
| the software inside them, although the manufacturer can do so. This is fundamentally | ||||
| incompatible with the aim of protecting users' freedom to change the software. The | ||||
| systematic pattern of such abuse occurs in the area of products for individuals to | ||||
| use, which is precisely where it is most unacceptable. Therefore, we have designed | ||||
| this version of the GPL to prohibit the practice for those products. If such problems | ||||
| arise substantially in other domains, we stand ready to extend this provision to | ||||
| those domains in future versions of the GPL, as needed to protect the freedom of | ||||
| users. | ||||
| 
 | ||||
| Finally, every program is threatened constantly by software patents. States should | ||||
| not allow patents to restrict development and use of software on general-purpose | ||||
| computers, but in those that do, we wish to avoid the special danger that patents | ||||
| applied to a free program could make it effectively proprietary. To prevent this, the | ||||
| GPL assures that patents cannot be used to render the program non-free. | ||||
| 
 | ||||
| The precise terms and conditions for copying, distribution and modification follow. | ||||
| 
 | ||||
| ## TERMS AND CONDITIONS | ||||
| 
 | ||||
| ### 0. Definitions | ||||
| 
 | ||||
| “This License” refers to version 3 of the GNU General Public License. | ||||
| 
 | ||||
| “Copyright” also means copyright-like laws that apply to other kinds of | ||||
| works, such as semiconductor masks. | ||||
| 
 | ||||
| “The Program” refers to any copyrightable work licensed under this | ||||
| License. Each licensee is addressed as “you”. “Licensees” and | ||||
| “recipients” may be individuals or organizations. | ||||
| 
 | ||||
| To “modify” a work means to copy from or adapt all or part of the work in | ||||
| a fashion requiring copyright permission, other than the making of an exact copy. The | ||||
| resulting work is called a “modified version” of the earlier work or a | ||||
| work “based on” the earlier work. | ||||
| 
 | ||||
| A “covered work” means either the unmodified Program or a work based on | ||||
| the Program. | ||||
| 
 | ||||
| To “propagate” a work means to do anything with it that, without | ||||
| permission, would make you directly or secondarily liable for infringement under | ||||
| applicable copyright law, except executing it on a computer or modifying a private | ||||
| copy. Propagation includes copying, distribution (with or without modification), | ||||
| making available to the public, and in some countries other activities as well. | ||||
| 
 | ||||
| To “convey” a work means any kind of propagation that enables other | ||||
| parties to make or receive copies. Mere interaction with a user through a computer | ||||
| network, with no transfer of a copy, is not conveying. | ||||
| 
 | ||||
| An interactive user interface displays “Appropriate Legal Notices” to the | ||||
| extent that it includes a convenient and prominently visible feature that **(1)** | ||||
| displays an appropriate copyright notice, and **(2)** tells the user that there is no | ||||
| warranty for the work (except to the extent that warranties are provided), that | ||||
| licensees may convey the work under this License, and how to view a copy of this | ||||
| License. If the interface presents a list of user commands or options, such as a | ||||
| menu, a prominent item in the list meets this criterion. | ||||
| 
 | ||||
| ### 1. Source Code | ||||
| 
 | ||||
| The “source code” for a work means the preferred form of the work for | ||||
| making modifications to it. “Object code” means any non-source form of a | ||||
| work. | ||||
| 
 | ||||
| A “Standard Interface” means an interface that either is an official | ||||
| standard defined by a recognized standards body, or, in the case of interfaces | ||||
| specified for a particular programming language, one that is widely used among | ||||
| developers working in that language. | ||||
| 
 | ||||
| The “System Libraries” of an executable work include anything, other than | ||||
| the work as a whole, that **(a)** is included in the normal form of packaging a Major | ||||
| Component, but which is not part of that Major Component, and **(b)** serves only to | ||||
| enable use of the work with that Major Component, or to implement a Standard | ||||
| Interface for which an implementation is available to the public in source code form. | ||||
| A “Major Component”, in this context, means a major essential component | ||||
| (kernel, window system, and so on) of the specific operating system (if any) on which | ||||
| the executable work runs, or a compiler used to produce the work, or an object code | ||||
| interpreter used to run it. | ||||
| 
 | ||||
| The “Corresponding Source” for a work in object code form means all the | ||||
| source code needed to generate, install, and (for an executable work) run the object | ||||
| code and to modify the work, including scripts to control those activities. However, | ||||
| it does not include the work's System Libraries, or general-purpose tools or | ||||
| generally available free programs which are used unmodified in performing those | ||||
| activities but which are not part of the work. For example, Corresponding Source | ||||
| includes interface definition files associated with source files for the work, and | ||||
| the source code for shared libraries and dynamically linked subprograms that the work | ||||
| is specifically designed to require, such as by intimate data communication or | ||||
| control flow between those subprograms and other parts of the work. | ||||
| 
 | ||||
| The Corresponding Source need not include anything that users can regenerate | ||||
| automatically from other parts of the Corresponding Source. | ||||
| 
 | ||||
| The Corresponding Source for a work in source code form is that same work. | ||||
| 
 | ||||
| ### 2. Basic Permissions | ||||
| 
 | ||||
| All rights granted under this License are granted for the term of copyright on the | ||||
| Program, and are irrevocable provided the stated conditions are met. This License | ||||
| explicitly affirms your unlimited permission to run the unmodified Program. The | ||||
| output from running a covered work is covered by this License only if the output, | ||||
| given its content, constitutes a covered work. This License acknowledges your rights | ||||
| of fair use or other equivalent, as provided by copyright law. | ||||
| 
 | ||||
| You may make, run and propagate covered works that you do not convey, without | ||||
| conditions so long as your license otherwise remains in force. You may convey covered | ||||
| works to others for the sole purpose of having them make modifications exclusively | ||||
| for you, or provide you with facilities for running those works, provided that you | ||||
| comply with the terms of this License in conveying all material for which you do not | ||||
| control copyright. Those thus making or running the covered works for you must do so | ||||
| exclusively on your behalf, under your direction and control, on terms that prohibit | ||||
| them from making any copies of your copyrighted material outside their relationship | ||||
| with you. | ||||
| 
 | ||||
| Conveying under any other circumstances is permitted solely under the conditions | ||||
| stated below. Sublicensing is not allowed; section 10 makes it unnecessary. | ||||
| 
 | ||||
| ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law | ||||
| 
 | ||||
| No covered work shall be deemed part of an effective technological measure under any | ||||
| applicable law fulfilling obligations under article 11 of the WIPO copyright treaty | ||||
| adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention | ||||
| of such measures. | ||||
| 
 | ||||
| When you convey a covered work, you waive any legal power to forbid circumvention of | ||||
| technological measures to the extent such circumvention is effected by exercising | ||||
| rights under this License with respect to the covered work, and you disclaim any | ||||
| intention to limit operation or modification of the work as a means of enforcing, | ||||
| against the work's users, your or third parties' legal rights to forbid circumvention | ||||
| of technological measures. | ||||
| 
 | ||||
| ### 4. Conveying Verbatim Copies | ||||
| 
 | ||||
| You may convey verbatim copies of the Program's source code as you receive it, in any | ||||
| medium, provided that you conspicuously and appropriately publish on each copy an | ||||
| appropriate copyright notice; keep intact all notices stating that this License and | ||||
| any non-permissive terms added in accord with section 7 apply to the code; keep | ||||
| intact all notices of the absence of any warranty; and give all recipients a copy of | ||||
| this License along with the Program. | ||||
| 
 | ||||
| You may charge any price or no price for each copy that you convey, and you may offer | ||||
| support or warranty protection for a fee. | ||||
| 
 | ||||
| ### 5. Conveying Modified Source Versions | ||||
| 
 | ||||
| You may convey a work based on the Program, or the modifications to produce it from | ||||
| the Program, in the form of source code under the terms of section 4, provided that | ||||
| you also meet all of these conditions: | ||||
| 
 | ||||
| * **a)** The work must carry prominent notices stating that you modified it, and giving a | ||||
| relevant date. | ||||
| * **b)** The work must carry prominent notices stating that it is released under this | ||||
| License and any conditions added under section 7. This requirement modifies the | ||||
| requirement in section 4 to “keep intact all notices”. | ||||
| * **c)** You must license the entire work, as a whole, under this License to anyone who | ||||
| comes into possession of a copy. This License will therefore apply, along with any | ||||
| applicable section 7 additional terms, to the whole of the work, and all its parts, | ||||
| regardless of how they are packaged. This License gives no permission to license the | ||||
| work in any other way, but it does not invalidate such permission if you have | ||||
| separately received it. | ||||
| * **d)** If the work has interactive user interfaces, each must display Appropriate Legal | ||||
| Notices; however, if the Program has interactive interfaces that do not display | ||||
| Appropriate Legal Notices, your work need not make them do so. | ||||
| 
 | ||||
| A compilation of a covered work with other separate and independent works, which are | ||||
| not by their nature extensions of the covered work, and which are not combined with | ||||
| it such as to form a larger program, in or on a volume of a storage or distribution | ||||
| medium, is called an “aggregate” if the compilation and its resulting | ||||
| copyright are not used to limit the access or legal rights of the compilation's users | ||||
| beyond what the individual works permit. Inclusion of a covered work in an aggregate | ||||
| does not cause this License to apply to the other parts of the aggregate. | ||||
| 
 | ||||
| ### 6. Conveying Non-Source Forms | ||||
| 
 | ||||
| You may convey a covered work in object code form under the terms of sections 4 and | ||||
| 5, provided that you also convey the machine-readable Corresponding Source under the | ||||
| terms of this License, in one of these ways: | ||||
| 
 | ||||
| * **a)** Convey the object code in, or embodied in, a physical product (including a | ||||
| physical distribution medium), accompanied by the Corresponding Source fixed on a | ||||
| durable physical medium customarily used for software interchange. | ||||
| * **b)** Convey the object code in, or embodied in, a physical product (including a | ||||
| physical distribution medium), accompanied by a written offer, valid for at least | ||||
| three years and valid for as long as you offer spare parts or customer support for | ||||
| that product model, to give anyone who possesses the object code either **(1)** a copy of | ||||
| the Corresponding Source for all the software in the product that is covered by this | ||||
| License, on a durable physical medium customarily used for software interchange, for | ||||
| a price no more than your reasonable cost of physically performing this conveying of | ||||
| source, or **(2)** access to copy the Corresponding Source from a network server at no | ||||
| charge. | ||||
| * **c)** Convey individual copies of the object code with a copy of the written offer to | ||||
| provide the Corresponding Source. This alternative is allowed only occasionally and | ||||
| noncommercially, and only if you received the object code with such an offer, in | ||||
| accord with subsection 6b. | ||||
| * **d)** Convey the object code by offering access from a designated place (gratis or for | ||||
| a charge), and offer equivalent access to the Corresponding Source in the same way | ||||
| through the same place at no further charge. You need not require recipients to copy | ||||
| the Corresponding Source along with the object code. If the place to copy the object | ||||
| code is a network server, the Corresponding Source may be on a different server | ||||
| (operated by you or a third party) that supports equivalent copying facilities, | ||||
| provided you maintain clear directions next to the object code saying where to find | ||||
| the Corresponding Source. Regardless of what server hosts the Corresponding Source, | ||||
| you remain obligated to ensure that it is available for as long as needed to satisfy | ||||
| these requirements. | ||||
| * **e)** Convey the object code using peer-to-peer transmission, provided you inform | ||||
| other peers where the object code and Corresponding Source of the work are being | ||||
| offered to the general public at no charge under subsection 6d. | ||||
| 
 | ||||
| A separable portion of the object code, whose source code is excluded from the | ||||
| Corresponding Source as a System Library, need not be included in conveying the | ||||
| object code work. | ||||
| 
 | ||||
| A “User Product” is either **(1)** a “consumer product”, which | ||||
| means any tangible personal property which is normally used for personal, family, or | ||||
| household purposes, or **(2)** anything designed or sold for incorporation into a | ||||
| dwelling. In determining whether a product is a consumer product, doubtful cases | ||||
| shall be resolved in favor of coverage. For a particular product received by a | ||||
| particular user, “normally used” refers to a typical or common use of | ||||
| that class of product, regardless of the status of the particular user or of the way | ||||
| in which the particular user actually uses, or expects or is expected to use, the | ||||
| product. A product is a consumer product regardless of whether the product has | ||||
| substantial commercial, industrial or non-consumer uses, unless such uses represent | ||||
| the only significant mode of use of the product. | ||||
| 
 | ||||
| “Installation Information” for a User Product means any methods, | ||||
| procedures, authorization keys, or other information required to install and execute | ||||
| modified versions of a covered work in that User Product from a modified version of | ||||
| its Corresponding Source. The information must suffice to ensure that the continued | ||||
| functioning of the modified object code is in no case prevented or interfered with | ||||
| solely because modification has been made. | ||||
| 
 | ||||
| If you convey an object code work under this section in, or with, or specifically for | ||||
| use in, a User Product, and the conveying occurs as part of a transaction in which | ||||
| the right of possession and use of the User Product is transferred to the recipient | ||||
| in perpetuity or for a fixed term (regardless of how the transaction is | ||||
| characterized), the Corresponding Source conveyed under this section must be | ||||
| accompanied by the Installation Information. But this requirement does not apply if | ||||
| neither you nor any third party retains the ability to install modified object code | ||||
| on the User Product (for example, the work has been installed in ROM). | ||||
| 
 | ||||
| The requirement to provide Installation Information does not include a requirement to | ||||
| continue to provide support service, warranty, or updates for a work that has been | ||||
| modified or installed by the recipient, or for the User Product in which it has been | ||||
| modified or installed. Access to a network may be denied when the modification itself | ||||
| materially and adversely affects the operation of the network or violates the rules | ||||
| and protocols for communication across the network. | ||||
| 
 | ||||
| Corresponding Source conveyed, and Installation Information provided, in accord with | ||||
| this section must be in a format that is publicly documented (and with an | ||||
| implementation available to the public in source code form), and must require no | ||||
| special password or key for unpacking, reading or copying. | ||||
| 
 | ||||
| ### 7. Additional Terms | ||||
| 
 | ||||
| “Additional permissions” are terms that supplement the terms of this | ||||
| License by making exceptions from one or more of its conditions. Additional | ||||
| permissions that are applicable to the entire Program shall be treated as though they | ||||
| were included in this License, to the extent that they are valid under applicable | ||||
| law. If additional permissions apply only to part of the Program, that part may be | ||||
| used separately under those permissions, but the entire Program remains governed by | ||||
| this License without regard to the additional permissions. | ||||
| 
 | ||||
| When you convey a copy of a covered work, you may at your option remove any | ||||
| additional permissions from that copy, or from any part of it. (Additional | ||||
| permissions may be written to require their own removal in certain cases when you | ||||
| modify the work.) You may place additional permissions on material, added by you to a | ||||
| covered work, for which you have or can give appropriate copyright permission. | ||||
| 
 | ||||
| Notwithstanding any other provision of this License, for material you add to a | ||||
| covered work, you may (if authorized by the copyright holders of that material) | ||||
| supplement the terms of this License with terms: | ||||
| 
 | ||||
| * **a)** Disclaiming warranty or limiting liability differently from the terms of | ||||
| sections 15 and 16 of this License; or | ||||
| * **b)** Requiring preservation of specified reasonable legal notices or author | ||||
| attributions in that material or in the Appropriate Legal Notices displayed by works | ||||
| containing it; or | ||||
| * **c)** Prohibiting misrepresentation of the origin of that material, or requiring that | ||||
| modified versions of such material be marked in reasonable ways as different from the | ||||
| original version; or | ||||
| * **d)** Limiting the use for publicity purposes of names of licensors or authors of the | ||||
| material; or | ||||
| * **e)** Declining to grant rights under trademark law for use of some trade names, | ||||
| trademarks, or service marks; or | ||||
| * **f)** Requiring indemnification of licensors and authors of that material by anyone | ||||
| who conveys the material (or modified versions of it) with contractual assumptions of | ||||
| liability to the recipient, for any liability that these contractual assumptions | ||||
| directly impose on those licensors and authors. | ||||
| 
 | ||||
| All other non-permissive additional terms are considered “further | ||||
| restrictions” within the meaning of section 10. If the Program as you received | ||||
| it, or any part of it, contains a notice stating that it is governed by this License | ||||
| along with a term that is a further restriction, you may remove that term. If a | ||||
| license document contains a further restriction but permits relicensing or conveying | ||||
| under this License, you may add to a covered work material governed by the terms of | ||||
| that license document, provided that the further restriction does not survive such | ||||
| relicensing or conveying. | ||||
| 
 | ||||
| If you add terms to a covered work in accord with this section, you must place, in | ||||
| the relevant source files, a statement of the additional terms that apply to those | ||||
| files, or a notice indicating where to find the applicable terms. | ||||
| 
 | ||||
| Additional terms, permissive or non-permissive, may be stated in the form of a | ||||
| separately written license, or stated as exceptions; the above requirements apply | ||||
| either way. | ||||
| 
 | ||||
| ### 8. Termination | ||||
| 
 | ||||
| You may not propagate or modify a covered work except as expressly provided under | ||||
| this License. Any attempt otherwise to propagate or modify it is void, and will | ||||
| automatically terminate your rights under this License (including any patent licenses | ||||
| granted under the third paragraph of section 11). | ||||
| 
 | ||||
| However, if you cease all violation of this License, then your license from a | ||||
| particular copyright holder is reinstated **(a)** provisionally, unless and until the | ||||
| copyright holder explicitly and finally terminates your license, and **(b)** permanently, | ||||
| if the copyright holder fails to notify you of the violation by some reasonable means | ||||
| prior to 60 days after the cessation. | ||||
| 
 | ||||
| Moreover, your license from a particular copyright holder is reinstated permanently | ||||
| if the copyright holder notifies you of the violation by some reasonable means, this | ||||
| is the first time you have received notice of violation of this License (for any | ||||
| work) from that copyright holder, and you cure the violation prior to 30 days after | ||||
| your receipt of the notice. | ||||
| 
 | ||||
| Termination of your rights under this section does not terminate the licenses of | ||||
| parties who have received copies or rights from you under this License. If your | ||||
| rights have been terminated and not permanently reinstated, you do not qualify to | ||||
| receive new licenses for the same material under section 10. | ||||
| 
 | ||||
| ### 9. Acceptance Not Required for Having Copies | ||||
| 
 | ||||
| You are not required to accept this License in order to receive or run a copy of the | ||||
| Program. Ancillary propagation of a covered work occurring solely as a consequence of | ||||
| using peer-to-peer transmission to receive a copy likewise does not require | ||||
| acceptance. However, nothing other than this License grants you permission to | ||||
| propagate or modify any covered work. These actions infringe copyright if you do not | ||||
| accept this License. Therefore, by modifying or propagating a covered work, you | ||||
| indicate your acceptance of this License to do so. | ||||
| 
 | ||||
| ### 10. Automatic Licensing of Downstream Recipients | ||||
| 
 | ||||
| Each time you convey a covered work, the recipient automatically receives a license | ||||
| from the original licensors, to run, modify and propagate that work, subject to this | ||||
| License. You are not responsible for enforcing compliance by third parties with this | ||||
| License. | ||||
| 
 | ||||
| An “entity transaction” is a transaction transferring control of an | ||||
| organization, or substantially all assets of one, or subdividing an organization, or | ||||
| merging organizations. If propagation of a covered work results from an entity | ||||
| transaction, each party to that transaction who receives a copy of the work also | ||||
| receives whatever licenses to the work the party's predecessor in interest had or | ||||
| could give under the previous paragraph, plus a right to possession of the | ||||
| Corresponding Source of the work from the predecessor in interest, if the predecessor | ||||
| has it or can get it with reasonable efforts. | ||||
| 
 | ||||
| You may not impose any further restrictions on the exercise of the rights granted or | ||||
| affirmed under this License. For example, you may not impose a license fee, royalty, | ||||
| or other charge for exercise of rights granted under this License, and you may not | ||||
| initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging | ||||
| that any patent claim is infringed by making, using, selling, offering for sale, or | ||||
| importing the Program or any portion of it. | ||||
| 
 | ||||
| ### 11. Patents | ||||
| 
 | ||||
| A “contributor” is a copyright holder who authorizes use under this | ||||
| License of the Program or a work on which the Program is based. The work thus | ||||
| licensed is called the contributor's “contributor version”. | ||||
| 
 | ||||
| A contributor's “essential patent claims” are all patent claims owned or | ||||
| controlled by the contributor, whether already acquired or hereafter acquired, that | ||||
| would be infringed by some manner, permitted by this License, of making, using, or | ||||
| selling its contributor version, but do not include claims that would be infringed | ||||
| only as a consequence of further modification of the contributor version. For | ||||
| purposes of this definition, “control” includes the right to grant patent | ||||
| sublicenses in a manner consistent with the requirements of this License. | ||||
| 
 | ||||
| Each contributor grants you a non-exclusive, worldwide, royalty-free patent license | ||||
| under the contributor's essential patent claims, to make, use, sell, offer for sale, | ||||
| import and otherwise run, modify and propagate the contents of its contributor | ||||
| version. | ||||
| 
 | ||||
| In the following three paragraphs, a “patent license” is any express | ||||
| agreement or commitment, however denominated, not to enforce a patent (such as an | ||||
| express permission to practice a patent or covenant not to sue for patent | ||||
| infringement). To “grant” such a patent license to a party means to make | ||||
| such an agreement or commitment not to enforce a patent against the party. | ||||
| 
 | ||||
| If you convey a covered work, knowingly relying on a patent license, and the | ||||
| Corresponding Source of the work is not available for anyone to copy, free of charge | ||||
| and under the terms of this License, through a publicly available network server or | ||||
| other readily accessible means, then you must either **(1)** cause the Corresponding | ||||
| Source to be so available, or **(2)** arrange to deprive yourself of the benefit of the | ||||
| patent license for this particular work, or **(3)** arrange, in a manner consistent with | ||||
| the requirements of this License, to extend the patent license to downstream | ||||
| recipients. “Knowingly relying” means you have actual knowledge that, but | ||||
| for the patent license, your conveying the covered work in a country, or your | ||||
| recipient's use of the covered work in a country, would infringe one or more | ||||
| identifiable patents in that country that you have reason to believe are valid. | ||||
| 
 | ||||
| If, pursuant to or in connection with a single transaction or arrangement, you | ||||
| convey, or propagate by procuring conveyance of, a covered work, and grant a patent | ||||
| license to some of the parties receiving the covered work authorizing them to use, | ||||
| propagate, modify or convey a specific copy of the covered work, then the patent | ||||
| license you grant is automatically extended to all recipients of the covered work and | ||||
| works based on it. | ||||
| 
 | ||||
| A patent license is “discriminatory” if it does not include within the | ||||
| scope of its coverage, prohibits the exercise of, or is conditioned on the | ||||
| non-exercise of one or more of the rights that are specifically granted under this | ||||
| License. You may not convey a covered work if you are a party to an arrangement with | ||||
| a third party that is in the business of distributing software, under which you make | ||||
| payment to the third party based on the extent of your activity of conveying the | ||||
| work, and under which the third party grants, to any of the parties who would receive | ||||
| the covered work from you, a discriminatory patent license **(a)** in connection with | ||||
| copies of the covered work conveyed by you (or copies made from those copies), or **(b)** | ||||
| primarily for and in connection with specific products or compilations that contain | ||||
| the covered work, unless you entered into that arrangement, or that patent license | ||||
| was granted, prior to 28 March 2007. | ||||
| 
 | ||||
| Nothing in this License shall be construed as excluding or limiting any implied | ||||
| license or other defenses to infringement that may otherwise be available to you | ||||
| under applicable patent law. | ||||
| 
 | ||||
| ### 12. No Surrender of Others' Freedom | ||||
| 
 | ||||
| If conditions are imposed on you (whether by court order, agreement or otherwise) | ||||
| that contradict the conditions of this License, they do not excuse you from the | ||||
| conditions of this License. If you cannot convey a covered work so as to satisfy | ||||
| simultaneously your obligations under this License and any other pertinent | ||||
| obligations, then as a consequence you may not convey it at all. For example, if you | ||||
| agree to terms that obligate you to collect a royalty for further conveying from | ||||
| those to whom you convey the Program, the only way you could satisfy both those terms | ||||
| and this License would be to refrain entirely from conveying the Program. | ||||
| 
 | ||||
| ### 13. Use with the GNU Affero General Public License | ||||
| 
 | ||||
| Notwithstanding any other provision of this License, you have permission to link or | ||||
| combine any covered work with a work licensed under version 3 of the GNU Affero | ||||
| General Public License into a single combined work, and to convey the resulting work. | ||||
| The terms of this License will continue to apply to the part which is the covered | ||||
| work, but the special requirements of the GNU Affero General Public License, section | ||||
| 13, concerning interaction through a network will apply to the combination as such. | ||||
| 
 | ||||
| ### 14. Revised Versions of this License | ||||
| 
 | ||||
| The Free Software Foundation may publish revised and/or new versions of the GNU | ||||
| General Public License from time to time. Such new versions will be similar in spirit | ||||
| to the present version, but may differ in detail to address new problems or concerns. | ||||
| 
 | ||||
| Each version is given a distinguishing version number. If the Program specifies that | ||||
| a certain numbered version of the GNU General Public License “or any later | ||||
| version” applies to it, you have the option of following the terms and | ||||
| conditions either of that numbered version or of any later version published by the | ||||
| Free Software Foundation. If the Program does not specify a version number of the GNU | ||||
| General Public License, you may choose any version ever published by the Free | ||||
| Software Foundation. | ||||
| 
 | ||||
| If the Program specifies that a proxy can decide which future versions of the GNU | ||||
| General Public License can be used, that proxy's public statement of acceptance of a | ||||
| version permanently authorizes you to choose that version for the Program. | ||||
| 
 | ||||
| Later license versions may give you additional or different permissions. However, no | ||||
| additional obligations are imposed on any author or copyright holder as a result of | ||||
| your choosing to follow a later version. | ||||
| 
 | ||||
| ### 15. Disclaimer of Warranty | ||||
| 
 | ||||
| THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. | ||||
| EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES | ||||
| PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER | ||||
| EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | ||||
| MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE | ||||
| QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE | ||||
| DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. | ||||
| 
 | ||||
| ### 16. Limitation of Liability | ||||
| 
 | ||||
| IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY | ||||
| COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS | ||||
| PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, | ||||
| INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE | ||||
| PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE | ||||
| OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE | ||||
| WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE | ||||
| POSSIBILITY OF SUCH DAMAGES. | ||||
| 
 | ||||
| ### 17. Interpretation of Sections 15 and 16 | ||||
| 
 | ||||
| If the disclaimer of warranty and limitation of liability provided above cannot be | ||||
| given local legal effect according to their terms, reviewing courts shall apply local | ||||
| law that most closely approximates an absolute waiver of all civil liability in | ||||
| connection with the Program, unless a warranty or assumption of liability accompanies | ||||
| a copy of the Program in return for a fee. | ||||
| 
 | ||||
| _END OF TERMS AND CONDITIONS_ | ||||
| 
 | ||||
| ## How to Apply These Terms to Your New Programs | ||||
| 
 | ||||
| If you develop a new program, and you want it to be of the greatest possible use to | ||||
| the public, the best way to achieve this is to make it free software which everyone | ||||
| can redistribute and change under these terms. | ||||
| 
 | ||||
| To do so, attach the following notices to the program. It is safest to attach them | ||||
| to the start of each source file to most effectively state the exclusion of warranty; | ||||
| and each file should have at least the “copyright” line and a pointer to | ||||
| where the full notice is found. | ||||
| 
 | ||||
|     <one line to give the program's name and a brief idea of what it does.> | ||||
|     Copyright (C) <year>  <name of author> | ||||
| 
 | ||||
|     This program is free software: you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation, either version 3 of the License, or | ||||
|     (at your option) any later version. | ||||
| 
 | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
| 
 | ||||
|     You should have received a copy of the GNU General Public License | ||||
|     along with this program.  If not, see <http://www.gnu.org/licenses/>. | ||||
| 
 | ||||
| Also add information on how to contact you by electronic and paper mail. | ||||
| 
 | ||||
| If the program does terminal interaction, make it output a short notice like this | ||||
| when it starts in an interactive mode: | ||||
| 
 | ||||
|     <program>  Copyright (C) <year>  <name of author> | ||||
|     This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'. | ||||
|     This is free software, and you are welcome to redistribute it | ||||
|     under certain conditions; type 'show c' for details. | ||||
| 
 | ||||
| The hypothetical commands `show w` and `show c` should show the appropriate parts of | ||||
| the General Public License. Of course, your program's commands might be different; | ||||
| for a GUI interface, you would use an “about box”. | ||||
| 
 | ||||
| You should also get your employer (if you work as a programmer) or school, if any, to | ||||
| sign a “copyright disclaimer” for the program, if necessary. For more | ||||
| information on this, and how to apply and follow the GNU GPL, see | ||||
| <<http://www.gnu.org/licenses/>>. | ||||
| 
 | ||||
| The GNU General Public License does not permit incorporating your program into | ||||
| proprietary programs. If your program is a subroutine library, you may consider it | ||||
| more useful to permit linking proprietary applications with the library. If this is | ||||
| what you want to do, use the GNU Lesser General Public License instead of this | ||||
| License. But first, please read | ||||
| <<http://www.gnu.org/philosophy/why-not-lgpl.html>>. | ||||
							
								
								
									
										7
									
								
								NAMESPACE
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								NAMESPACE
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,7 @@ | |||
| # Generated by roxygen2: do not edit by hand | ||||
| 
 | ||||
| export(analyze) | ||||
| export(optimize_weights) | ||||
| export(preset) | ||||
| export(ranking) | ||||
| import(data.table) | ||||
							
								
								
									
										72
									
								
								R/analyze.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										72
									
								
								R/analyze.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,72 @@ | |||
#' Build a preset describing an analysis.
#'
#' A preset bundles the methods to run together with their inputs. The genes
#' to screen should usually contain the reference genes as well, because this
#' is what makes it possible to assess the results afterwards.
#'
#' The following methods are available:
#'
#'  - `clusteriness` How much the gene distances cluster across species.
#'  - `correlation` The mean correlation with the reference genes.
#'  - `proximity` Mean proximity to telomeres.
#'  - `neural` Assessment by neural network.
#'
#' @param methods IDs of methods to apply.
#' @param species IDs of species to include.
#' @param genes IDs of genes to screen.
#' @param reference_genes IDs of reference genes to compare to.
#'
#' @return The preset to use with [analyze()].
#'
#' @export
preset <- function(methods, species, genes, reference_genes) {
    # Collect the values first and label them afterwards; this keeps entries
    # that are `NULL` in place.
    fields <- list(methods, species, genes, reference_genes)
    names(fields) <- c(
        "method_ids",
        "species_ids",
        "gene_ids",
        "reference_gene_ids"
    )

    fields
}
| 
 | ||||
#' Analyze by applying the specified preset.
#'
#' Every method listed in the preset is run on the gene distance data and the
#' resulting scores are collected in a single table.
#'
#' @param preset The preset to use which can be created using [preset()].
#'
#' @return A [data.table] with one row for each gene identified by its ID
#'   (`gene` column). The additional columns contain the resulting scores per
#'   method and are named after the method IDs.
#'
#' @export
analyze <- function(preset) {
    # Available methods by ID.
    #
    # A method describes a way to perform a computation on gene distance data
    # that results in a single score per gene. The function should accept the
    # distance data and the preset to apply (see [preset()]).
    #
    # The function should return a [data.table] with the following columns:
    #
    #  - `gene` Gene ID of the processed gene.
    #  - `score` Score for the gene between 0.0 and 1.0.
    methods <- list(
        "clusteriness" = clusteriness,
        "correlation" = correlation,
        "proximity" = proximity,
        "neural" = neural
    )

    # Fail early and readably. Looking up an unknown ID yields `NULL`, which
    # would otherwise surface as "attempt to apply non-function".
    unknown_ids <- setdiff(preset$method_ids, names(methods))

    if (length(unknown_ids) > 0) {
        stop(
            "Unknown method IDs: ",
            paste(unknown_ids, collapse = ", "),
            call. = FALSE
        )
    }

    results <- data.table(gene = genes$id)

    for (method_id in preset$method_ids) {
        method_results <- methods[[method_id]](distances, preset)

        # Name the score column after the method that produced it.
        setnames(method_results, "score", method_id)

        # Only genes present on both sides are kept by this merge.
        results <- merge(
            results,
            method_results,
            by = "gene"
        )
    }

    results
}
							
								
								
									
										54
									
								
								R/clusteriness.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								R/clusteriness.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
# Score how strongly a set of values clusters.
#
# The values are clustered hierarchically (`hclust`) and the tree is cut at
# the given height (`cutree`). The clusters are ordered by size, largest
# first. Every cluster with two or more members adds its share of all values,
# divided by its position in that order, so lower ranking clusters contribute
# less. The result is a score between zero and one.
clusteriness_priv <- function(data, height = 1000000) {
    value_count <- length(data)

    # With fewer than two values there is nothing to cluster.
    if (value_count < 2) {
        return(0.0)
    }

    assignments <- stats::cutree(
        stats::hclust(stats::dist(data)),
        h = height
    )

    sizes_by_position <- sort(tabulate(assignments), decreasing = TRUE)

    total <- 0.0

    # Visit the qualifying clusters from the largest to the smallest.
    for (position in which(sizes_by_position >= 2)) {
        share <- sizes_by_position[position] / value_count
        total <- total + share / position
    }

    total
}
| 
 | ||||
# Process genes clustering their distance to telomeres.
#
# Parameters:
#
#  - `distances` Table with the columns `species`, `gene` and `distance`.
#  - `preset` The preset to apply (see [preset()]). Its `species_ids` and
#    `gene_ids` are used.
#
# Returns a [data.table] with the columns `gene` and `score` containing one
# row per entry of `preset$gene_ids`.
clusteriness <- function(distances, preset) {
    results <- data.table(gene = preset$gene_ids)

    # Prefilter the input data by species.
    distances <- distances[species %chin% preset$species_ids]

    # Add an index for quickly accessing data per gene.
    setkey(distances, gene)

    # Perform the cluster analysis for one gene.
    compute <- function(gene_id) {
        clusteriness_priv(distances[gene_id, distance])
    }

    # Score gene by gene. Unlike grouping by `1:nrow(results)`, this also
    # works for an empty gene list (`1:0` is `c(1, 0)`, not an empty
    # sequence).
    results[, score := vapply(gene, compute, numeric(1), USE.NAMES = FALSE)]
}
							
								
								
									
										61
									
								
								R/correlation.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								R/correlation.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,61 @@ | |||
# Compute the mean correlation coefficient comparing gene distances with a set
# of reference genes.
#
# Parameters:
#
#  - `distances` Table with the columns `species`, `gene` and `distance`.
#  - `preset` The preset to apply (see [preset()]). Its `species_ids`,
#    `gene_ids` and `reference_gene_ids` are used.
#
# Returns a [data.table] with the columns `gene` and `score` containing one
# row per entry of `preset$gene_ids`.
correlation <- function(distances, preset) {
    results <- data.table(gene = preset$gene_ids)
    reference_gene_ids <- preset$reference_gene_ids
    reference_count <- length(reference_gene_ids)

    # Prefilter distances by species.
    distances <- distances[species %chin% preset$species_ids]

    # Add an index for quickly accessing data per gene.
    setkey(distances, gene)

    # Prepare the reference genes' data.
    reference_distances <- distances[gene %chin% reference_gene_ids]

    # Perform the correlation for one gene.
    compute <- function(gene_id) {
        gene_distances <- distances[gene_id]

        # Return a score of 0.0 if there is just one or no value at all.
        if (nrow(gene_distances) <= 1) {
            return(0.0)
        }

        # Buffer for the sum of correlation coefficients.
        correlation_sum <- 0

        # Correlate with all reference genes but not with the gene itself.
        for (reference_gene_id in
             reference_gene_ids[reference_gene_ids != gene_id]) {
            # Pair up the distances of both genes species by species.
            data <- merge(
                gene_distances,
                reference_distances[reference_gene_id],
                by = "species"
            )

            # Skip this reference gene, if there are not enough value pairs.
            # This will lessen the final score, because it effectively
            # represents a correlation coefficient of 0.0.
            if (nrow(data) <= 1) {
                next
            }

            # Spearman's coefficient is computed from the ranks of the paired
            # values, so the order of the rows does not matter and no sorting
            # is needed beforehand.
            correlation_sum <- correlation_sum + abs(stats::cor(
                data[, distance.x], data[, distance.y],
                method = "spearman"
            ))
        }

        # Compute the score as the mean correlation coefficient.
        #
        # NOTE(review): The divisor is the number of all reference genes, even
        # if the gene itself is one of them and was left out above. Reference
        # genes therefore score slightly lower than other genes. Confirm
        # whether this is intended.
        correlation_sum / reference_count
    }

    # Score gene by gene. Unlike grouping by `1:nrow(results)`, this also
    # works for an empty gene list.
    results[, score := vapply(gene, compute, numeric(1), USE.NAMES = FALSE)]
}
							
								
								
									
										35
									
								
								R/data.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								R/data.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,35 @@ | |||
| #' Information on included species from the Ensembl database. | ||||
| #' | ||||
| #' @format A [data.table] with 91 rows and 2 variables: | ||||
| #' \describe{ | ||||
| #'   \item{id}{Unique species ID} | ||||
| #'   \item{name}{Human readable species name} | ||||
| #' } | ||||
| "species" | ||||
| 
 | ||||
| #' Information on human genes within the Ensembl database. | ||||
| #' | ||||
| #' This includes only genes on the primary suggested assembly of the human | ||||
| #' nuclear DNA. | ||||
| #' | ||||
| #' @format A [data.table] with 60568 rows and 3 variables: | ||||
| #' \describe{ | ||||
| #'   \item{id}{Ensembl gene ID} | ||||
| #'   \item{name}{The gene's HGNC name} | ||||
| #'   \item{chromosome}{The human chromosome the gene is located on} | ||||
| #'   \item{n_species}{Number of known species with the gene.} | ||||
| #' } | ||||
| "genes" | ||||
| 
 | ||||
| #' Information on gene positions across species. | ||||
| #' | ||||
| #' This dataset contains each known value for a gene's distance to the telomeres | ||||
| #' per species. The data is sourced from Ensembl. | ||||
| #' | ||||
| #' @format A [data.table] with 1390730 rows and 3 variables: | ||||
| #' \describe{ | ||||
| #'   \item{species}{Species ID} | ||||
| #'   \item{gene}{Gene ID} | ||||
| #'   \item{distance}{Distance to nearest telomere} | ||||
| #' } | ||||
| "distances" | ||||
							
								
								
									
										96
									
								
								R/neural.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										96
									
								
								R/neural.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,96 @@ | |||
# Find genes by training a neural network on reference position data.
#
# For every species with enough data, the genes' distances become one input
# variable. The network is trained to output 1.0 for the reference genes and
# 0.0 for a random sample of the other genes. It is then applied to all genes.
#
# @param distances Table with the columns `species`, `gene` and `distance`.
# @param preset The preset to apply (see [preset()]).
# @param seed A seed to get reproducible results.
#
# @return A [data.table] with the columns `gene` and `score`.
neural <- function(distances, preset, seed = 448077) {
    species_ids <- preset$species_ids
    reference_gene_ids <- preset$reference_gene_ids

    set.seed(seed)
    gene_count <- length(preset$gene_ids)

    # Prefilter distances by species.
    distances <- distances[species %chin% species_ids]

    # Input data for the network. This contains the gene ID as an identifier
    # as well as the per-species gene distances as input variables.
    data <- data.table(gene = preset$gene_ids)

    # Buffer to keep track of species included in the computation. Species
    # from `species_ids` may be excluded if they don't have enough data.
    species_ids_included <- NULL

    # Make a column containing distance data for each species.
    for (species_id in species_ids) {
        species_distances <- distances[species == species_id, .(gene, distance)]
        species_distances <- stats::na.omit(species_distances)

        # Only include species with at least 25% known values.
        if (nrow(species_distances) >= 0.25 * gene_count) {
            species_ids_included <- c(species_ids_included, species_id)

            data <- merge(
                data,
                species_distances,
                by = "gene",
                all.x = TRUE
            )

            # Replace missing data with mean values. The neural network can't
            # handle NAs in a meaningful way. Choosing extreme values here
            # would result in heavily biased results. Therefore, the mean value
            # is chosen as a compromise. However, this will of course lessen the
            # significance of the results.
            mean_distance <- round(species_distances[, mean(distance)])
            data[is.na(distance), distance := mean_distance]

            # Name the new column after the species.
            setnames(data, "distance", species_id)
        }
    }

    # Without any input variable, no valid model formula can be built.
    if (length(species_ids_included) == 0) {
        stop(
            "None of the species has enough data to train the network.",
            call. = FALSE
        )
    }

    # Extract the reference genes.
    reference_data <- data[gene %chin% reference_gene_ids]
    reference_data[, neural := 1.0]

    # Take out random samples from the remaining genes. This is another
    # compromise with a negative impact on significance. Because there is no
    # information on genes which are explicitly *not* TPE-OLD genes, we have to
    # assume that a random sample of genes has a low probability of including
    # TPE-OLD genes.
    without_reference_data <- data[!gene %chin% reference_gene_ids]

    # Never ask for more samples than there are genes to draw from.
    sample_count <- min(nrow(without_reference_data), nrow(reference_data))

    reference_samples <- without_reference_data[
        sample.int(nrow(without_reference_data), sample_count)
    ]

    reference_samples[, neural := 0.0]

    # Merge training data. The training data includes all reference genes as
    # well as the random sample genes.
    training_data <- rbindlist(list(reference_data, reference_samples))

    # Construct and train the neural network.
    nn_formula <- stats::as.formula(sprintf(
        "neural~%s",
        paste(species_ids_included, collapse = "+")
    ))

    # Size the hidden layers relative to the number of species that are
    # actually used as input variables. Each layer has about two thirds of
    # the size of the previous one, as a whole number of at least one neuron.
    layer1 <- length(species_ids_included) * 0.66
    layer2 <- layer1 * 0.66
    layer3 <- layer2 * 0.66
    hidden_layers <- pmax(1, floor(c(layer1, layer2, layer3)))

    nn <- neuralnet::neuralnet(
        nn_formula,
        training_data,
        hidden = hidden_layers,
        linear.output = FALSE
    )

    # Return the resulting scores given by applying the neural network.
    data[, score := neuralnet::compute(nn, data)$net.result]
    data[, .(gene, score)]
}
							
								
								
									
										18
									
								
								R/proximity.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								R/proximity.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,18 @@ | |||
# Rate genes by how close they are to the telomeres on average.
#
# The mean distance across species is computed per gene. The gene with the
# largest mean distance receives a score of 0.0 and a mean distance of 0
# corresponds to a score of 1.0.
proximity <- function(distances, preset) {
    # Keep only the requested genes within the requested species.
    relevant <- distances[
        gene %chin% preset$gene_ids & species %chin% preset$species_ids
    ]

    per_gene <- relevant[, .(mean_distance = mean(distance)), by = "gene"]
    largest_mean <- per_gene[, max(mean_distance)]

    # Scale by the largest mean distance and invert, so that near is high.
    per_gene[, score := 1 - mean_distance / largest_mean]
    per_gene[, .(gene, score)]
}
							
								
								
									
										63
									
								
								R/ranking.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								R/ranking.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,63 @@ | |||
#' Rank the results by computing a score.
#'
#' This function takes the result from [analyze()] and creates a score by
#' computing a weighted mean across the different methods' results.
#'
#' @param results Results from [analyze()].
#' @param weights Named list pairing method names with weighting factors.
#'
#' @return The input data with an additional column containing the score and
#'   another column containing the rank.
#'
#' @export
ranking <- function(results, weights) {
    # Every weighted method needs its column of scores.
    missing_methods <- setdiff(names(weights), names(results))

    if (length(missing_methods) > 0) {
        stop(
            "No results for methods: ",
            paste(missing_methods, collapse = ", "),
            call. = FALSE
        )
    }

    # Work on a copy to leave the caller's table untouched.
    results <- copy(results)
    results[, score := 0.0]

    for (method in names(weights)) {
        # Plain vector arithmetic on the method's column.
        contribution <- weights[[method]] * results[[method]]
        results[, score := score + contribution]
    }

    # Normalize scores to be between 0.0 and 1.0.
    results[, score := score / sum(unlist(weights))]

    # Best score first. Missing scores go to the end instead of taking the
    # top ranks.
    setorder(results, -score, na.last = TRUE)
    results[, rank := .I]
}
| 
 | ||||
#' Find the best weights to rank the results.
#'
#' This function finds the optimal parameters to [ranking()] that result in the
#' reference genes ranking particularly high.
#'
#' @param results Results from [analyze()] or [ranking()].
#' @param methods Methods to include in the score.
#' @param reference_gene_ids IDs of the reference genes.
#'
#' @return Named list pairing method names with their optimal weights.
#'
#' @export
optimize_weights <- function(results, methods, reference_gene_ids) {
    # Create the named list from the factors vector.
    #
    # The list is built directly. Filling an object that starts out as `NULL`
    # via `[[<-` may produce an atomic vector instead of the documented list.
    weights <- function(factors) {
        stats::setNames(as.list(factors), methods)
    }

    # Compute the mean rank of the reference genes when applying the weights.
    mean_rank <- function(factors) {
        data <- ranking(results, weights(factors))
        data[gene %chin% reference_gene_ids, mean(rank)]
    }

    # Start with equal weights and minimize the mean rank.
    factors <- stats::optim(rep(1.0, length(methods)), mean_rank)$par

    # Scale the weights so that they sum up to one.
    weights(factors / sum(factors))
}
							
								
								
									
										3
									
								
								R/utils.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								R/utils.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,3 @@ | |||
| # This is needed to make data.table's symbols available within the package. | ||||
| #' @import data.table | ||||
| NULL | ||||
							
								
								
									
										
											BIN
										
									
								
								data/distances.rda
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								data/distances.rda
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								data/genes.rda
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								data/genes.rda
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								data/species.rda
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								data/species.rda
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										19
									
								
								man/analyze.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								man/analyze.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/analyze.R | ||||
| \name{analyze} | ||||
| \alias{analyze} | ||||
| \title{Analyze by applying the specified preset.} | ||||
| \usage{ | ||||
| analyze(preset) | ||||
| } | ||||
| \arguments{ | ||||
| \item{preset}{The preset to use which can be created using \code{\link[=preset]{preset()}}.} | ||||
| } | ||||
| \value{ | ||||
| A \link{data.table} with one row for each gene identified by its ID | ||||
| (\code{gene} column). The additional columns contain the resulting scores per | ||||
| method and are named after the method IDs. | ||||
| } | ||||
| \description{ | ||||
| Analyze by applying the specified preset. | ||||
| } | ||||
							
								
								
									
										22
									
								
								man/distances.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								man/distances.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,22 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/data.R | ||||
| \docType{data} | ||||
| \name{distances} | ||||
| \alias{distances} | ||||
| \title{Information on gene positions across species.} | ||||
| \format{ | ||||
| A \link{data.table} with 1390730 rows and 3 variables: | ||||
| \describe{ | ||||
| \item{species}{Species ID} | ||||
| \item{gene}{Gene ID} | ||||
| \item{distance}{Distance to nearest telomere} | ||||
| } | ||||
| } | ||||
| \usage{ | ||||
| distances | ||||
| } | ||||
| \description{ | ||||
| This dataset contains each known value for a gene's distance to the telomeres | ||||
| per species. The data is sourced from Ensembl. | ||||
| } | ||||
| \keyword{datasets} | ||||
							
								
								
									
										23
									
								
								man/genes.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								man/genes.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,23 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/data.R | ||||
| \docType{data} | ||||
| \name{genes} | ||||
| \alias{genes} | ||||
| \title{Information on human genes within the Ensembl database.} | ||||
| \format{ | ||||
| A \link{data.table} with 60568 rows and 4 variables: | ||||
| \describe{ | ||||
| \item{id}{Ensembl gene ID} | ||||
| \item{name}{The gene's HGNC name} | ||||
| \item{chromosome}{The human chromosome the gene is located on} | ||||
| \item{n_species}{Number of known species with the gene.} | ||||
| } | ||||
| } | ||||
| \usage{ | ||||
| genes | ||||
| } | ||||
| \description{ | ||||
| This includes only genes on the primary suggested assembly of the human | ||||
| nuclear DNA. | ||||
| } | ||||
| \keyword{datasets} | ||||
							
								
								
									
										22
									
								
								man/optimize_weights.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								man/optimize_weights.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,22 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/ranking.R | ||||
| \name{optimize_weights} | ||||
| \alias{optimize_weights} | ||||
| \title{Find the best weights to rank the results.} | ||||
| \usage{ | ||||
| optimize_weights(results, methods, reference_gene_ids) | ||||
| } | ||||
| \arguments{ | ||||
| \item{results}{Results from \code{\link[=analyze]{analyze()}} or \code{\link[=ranking]{ranking()}}.} | ||||
| 
 | ||||
| \item{methods}{Methods to include in the score.} | ||||
| 
 | ||||
| \item{reference_gene_ids}{IDs of the reference genes.} | ||||
| } | ||||
| \value{ | ||||
| Named list pairing method names with their optimal weights. | ||||
| } | ||||
| \description{ | ||||
| This function finds the optimal parameters to \code{\link[=ranking]{ranking()}} that result in the | ||||
| reference genes ranking particularly high. | ||||
| } | ||||
							
								
								
									
										34
									
								
								man/preset.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								man/preset.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,34 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/analyze.R | ||||
| \name{preset} | ||||
| \alias{preset} | ||||
| \title{Create a new preset.} | ||||
| \usage{ | ||||
| preset(methods, species, genes, reference_genes) | ||||
| } | ||||
| \arguments{ | ||||
| \item{methods}{IDs of methods to apply.} | ||||
| 
 | ||||
| \item{species}{IDs of species to include.} | ||||
| 
 | ||||
| \item{genes}{IDs of genes to screen.} | ||||
| 
 | ||||
| \item{reference_genes}{IDs of reference genes to compare to.} | ||||
| } | ||||
| \value{ | ||||
| The preset to use with \code{\link[=analyze]{analyze()}}. | ||||
| } | ||||
| \description{ | ||||
| A preset is used to specify which methods and inputs should be used for an | ||||
| analysis. Note that the genes to process should normally include the | ||||
| reference genes to be able to assess the results later. | ||||
| } | ||||
| \details{ | ||||
| Available methods are: | ||||
| \itemize{ | ||||
| \item \code{clusteriness} How much the gene distances cluster across species. | ||||
| \item \code{correlation} The mean correlation with the reference genes. | ||||
| \item \code{proximity} Mean proximity to telomeres. | ||||
| \item \code{neural} Assessment by neural network. | ||||
| } | ||||
| } | ||||
							
								
								
									
										17
									
								
								man/ranking.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								man/ranking.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,17 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/ranking.R | ||||
| \name{ranking} | ||||
| \alias{ranking} | ||||
| \title{Rank the results by computing a score.} | ||||
| \usage{ | ||||
| ranking(results, weights) | ||||
| } | ||||
| \arguments{ | ||||
| \item{results}{Results from \code{\link[=analyze]{analyze()}}.} | ||||
| 
 | ||||
| \item{weights}{Named list pairing method names with weighting factors.} | ||||
| } | ||||
| \description{ | ||||
| This function takes the result from \code{\link[=analyze]{analyze()}} and creates a score by | ||||
| computing a weighted mean across the different methods' results. | ||||
| } | ||||
							
								
								
									
										20
									
								
								man/species.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								man/species.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/data.R | ||||
| \docType{data} | ||||
| \name{species} | ||||
| \alias{species} | ||||
| \title{Information on included species from the Ensembl database.} | ||||
| \format{ | ||||
| A \link{data.table} with 91 rows and 2 variables: | ||||
| \describe{ | ||||
| \item{id}{Unique species ID} | ||||
| \item{name}{Human readable species name} | ||||
| } | ||||
| } | ||||
| \usage{ | ||||
| species | ||||
| } | ||||
| \description{ | ||||
| Information on included species from the Ensembl database. | ||||
| } | ||||
| \keyword{datasets} | ||||
							
								
								
									
										154
									
								
								scripts/ensembl.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										154
									
								
								scripts/ensembl.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,154 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| rlog::log_info("Connecting to Ensembl API") | ||||
| 
 | ||||
| #' Object to access the Ensembl API (used as the `mart` for all datasets). | ||||
| ensembl <- biomaRt::useEnsembl("ensembl") | ||||
| 
 | ||||
| # Retrieve species information from the list of Ensembl datasets. | ||||
| 
 | ||||
| rlog::log_info("Retrieving species information") | ||||
| ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl)) | ||||
| 
 | ||||
| # Extract the species ID and name from each dataset's name and description. | ||||
| species <- ensembl_datasets[, .( | ||||
|     id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2], | ||||
|     name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] | ||||
| )] | ||||
| 
 | ||||
| #' Get all chromosome names for an Ensembl dataset. | ||||
| #' | ||||
| #' Valid chromosome names are decimal numbers as well as the typical sex | ||||
| #' chromosome names (X, Y, W and Z). All other options of the dataset's | ||||
| #' "chromosome_name" filter are dropped. | ||||
| #' | ||||
| #' @param dataset The Ensembl dataset to query the filter options of. | ||||
| #' | ||||
| #' @returns The valid chromosome names, which may be empty. | ||||
| get_chromosome_names <- function(dataset) { | ||||
|     chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") | ||||
| 
 | ||||
|     # The alternation has to be grouped for both anchors to apply to both | ||||
|     # alternatives. Otherwise, every name merely starting with a digit or | ||||
|     # merely ending with one of the letters would be accepted. | ||||
|     # NOTE(review): Names like "2A" are no longer accepted, which matches | ||||
|     # the documented contract. Confirm that this is intended. | ||||
|     chromosome_names[stringr::str_which( | ||||
|         chromosome_names, | ||||
|         "^([0-9]+|[XYWZ])$" | ||||
|     )] | ||||
| } | ||||
| 
 | ||||
| # Retrieve information on human genes. Only genes on chromosomes accepted by | ||||
| # get_chromosome_names() are requested from Ensembl. | ||||
| 
 | ||||
| rlog::log_info("Retrieving information on human genes") | ||||
| dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl) | ||||
| 
 | ||||
| human_data <- data.table(biomaRt::getBM( | ||||
|     attributes = c( | ||||
|         "ensembl_gene_id", | ||||
|         "hgnc_symbol", | ||||
|         "chromosome_name", | ||||
|         "start_position", | ||||
|         "end_position" | ||||
|     ), | ||||
|     filters = "chromosome_name", | ||||
|     values = get_chromosome_names(dataset), | ||||
|     mart = dataset | ||||
| )) | ||||
| 
 | ||||
| # Keep one row per gene ID (at the time of writing, a handful are duplicated). | ||||
| human_data <- unique(human_data, by = "ensembl_gene_id") | ||||
| 
 | ||||
| # Only keep the ID, HGNC symbol and chromosome name of each gene. | ||||
| genes <- human_data[, .( | ||||
|     id = ensembl_gene_id, | ||||
|     name = hgnc_symbol, | ||||
|     chromosome = chromosome_name | ||||
| )] | ||||
| 
 | ||||
| # Retrieve gene distance data across species. | ||||
| 
 | ||||
| rlog::log_info("Retrieving distance data") | ||||
| 
 | ||||
| # Handle the human first: the data is already loaded and needs no ortholog | ||||
| # filter. The largest gene end position stands in for the chromosome length. | ||||
| 
 | ||||
| human_data[, chromosome_length := max(end_position), by = chromosome_name] | ||||
| 
 | ||||
| distances <- human_data[, .( | ||||
|     species = "hsapiens", | ||||
|     gene = ensembl_gene_id, | ||||
|     distance = pmin( | ||||
|         start_position, | ||||
|         chromosome_length - end_position | ||||
|     ) | ||||
| )] | ||||
| 
 | ||||
| # Iterate through all other species and retrieve their distance data. The | ||||
| # per-species tables are collected in a list and combined once after the | ||||
| # loop, which avoids copying the growing result for every single species. | ||||
| species_distances_list <- list() | ||||
| 
 | ||||
| for (species_id in species[id != "hsapiens", id]) { | ||||
|     rlog::log_info(sprintf("Loading species \"%s\"", species_id)) | ||||
| 
 | ||||
|     dataset <- biomaRt::useDataset( | ||||
|         sprintf("%s_gene_ensembl", species_id), | ||||
|         mart = ensembl | ||||
|     ) | ||||
| 
 | ||||
|     # Besides the attributes that are always present, we need to check for | ||||
|     # human orthologs. Some species don't have that information and will be | ||||
|     # skipped. Skipped species are also removed from the species table. | ||||
|     if (!"hsapiens_homolog_ensembl_gene" %chin% | ||||
|         biomaRt::listAttributes(dataset, what = "name")) { | ||||
|         rlog::log_info("No data on human orthologs") | ||||
|         species <- species[id != species_id] | ||||
| 
 | ||||
|         next | ||||
|     } | ||||
| 
 | ||||
|     chromosome_names <- get_chromosome_names(dataset) | ||||
| 
 | ||||
|     # Skip the species, if there are no assembled chromosomes. | ||||
|     if (length(chromosome_names) == 0) { | ||||
|         rlog::log_info("No matching chromosome assemblies") | ||||
|         species <- species[id != species_id] | ||||
| 
 | ||||
|         next | ||||
|     } | ||||
| 
 | ||||
|     # Retrieve information on all genes of the current species, that have | ||||
|     # human orthologs. This is called "homolog" in the Ensembl schema. | ||||
|     species_distances <- data.table(biomaRt::getBM( | ||||
|         attributes = c( | ||||
|             "hsapiens_homolog_ensembl_gene", | ||||
|             "chromosome_name", | ||||
|             "start_position", | ||||
|             "end_position" | ||||
|         ), | ||||
|         filters = c("with_hsapiens_homolog", "chromosome_name"), | ||||
|         values = list(TRUE, chromosome_names), | ||||
|         mart = dataset | ||||
|     )) | ||||
| 
 | ||||
|     # Only include one ortholog per human gene. | ||||
|     species_distances <- unique( | ||||
|         species_distances, | ||||
|         by = "hsapiens_homolog_ensembl_gene" | ||||
|     ) | ||||
| 
 | ||||
|     # Precompute the genes' distance to the nearest telomere. The largest | ||||
|     # end position of the retrieved genes stands in for the chromosome | ||||
|     # length. | ||||
| 
 | ||||
|     species_distances[, | ||||
|         chromosome_length := max(end_position), | ||||
|         by = chromosome_name | ||||
|     ] | ||||
| 
 | ||||
|     species_distances_list[[species_id]] <- species_distances[, .( | ||||
|         species = species_id, | ||||
|         gene = hsapiens_homolog_ensembl_gene, | ||||
|         distance = pmin( | ||||
|             start_position, | ||||
|             chromosome_length - end_position | ||||
|         ) | ||||
|     )] | ||||
| } | ||||
| 
 | ||||
| # Append the collected tables to the human data in the order of retrieval. | ||||
| distances <- rbindlist(c(list(distances), species_distances_list)) | ||||
| 
 | ||||
| # Count for each gene the number of species it has distance data for and | ||||
| # attach that count to the gene table. | ||||
| 
 | ||||
| genes_n_species <- distances[, list(n_species = .N), by = gene] | ||||
| genes <- merge(genes, genes_n_species, by.x = "id", by.y = "gene") | ||||
| 
 | ||||
| # Store all three tables as package data, replacing existing files. | ||||
| 
 | ||||
| usethis::use_data(species, genes, distances, overwrite = TRUE) | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue