Spaces — Running

kiyer committed

Commit d1fa2c0 · Parent(s): 9c7a7db
major upgrade to v2.0
Changed files:
- absts/.DS_Store +0 -0
- app.py +981 -53
- local_files/astro_ph_ga_feeds_ada_embedding_27-Jun-2023.pkl → data/data-00000-of-00012.arrow +2 -2
- local_files/astro_ph_ga_embedding_16-Jun-2024.pkl → data/data-00001-of-00012.arrow +2 -2
- local_files/astro_ph_ga_embedding_27-Jun-2023.pkl → data/data-00002-of-00012.arrow +2 -2
- local_files/astro_ph_ga_feeds_ada_embedding_16-Jun-2024.pkl → data/data-00003-of-00012.arrow +2 -2
- data/data-00004-of-00012.arrow +3 -0
- data/data-00005-of-00012.arrow +3 -0
- data/data-00006-of-00012.arrow +3 -0
- data/data-00007-of-00012.arrow +3 -0
- data/data-00008-of-00012.arrow +3 -0
- data/data-00009-of-00012.arrow +3 -0
- data/data-00010-of-00012.arrow +3 -0
- data/data-00011-of-00012.arrow +3 -0
- data/dataset_info.json +188 -0
- data/state.json +46 -0
- local_files/astro_ph_ga_feeds_upto_16-Jun-2024.pkl +0 -3
- local_files/astro_ph_ga_feeds_upto_27-Jun-2023.pkl +0 -3
- pages/.ipynb_checkpoints/Untitled-checkpoint.ipynb +0 -6
- pages/1_arxiv_embedding_explorer.py +0 -121
- pages/2_paper_search.py +0 -201
- pages/3_answering_questions.py +0 -352
- pages/4_author_search.py +0 -138
- pages/5_research_hotspots.py +0 -130
- pages/6_qa_sources_v1.py +0 -286
- pages/7_answering_questions_2024.py +0 -352
- pages/8_arxiv_embedding_explorer_2024.py +0 -121
- pages/9_research_hotspots_2024.py +0 -130
- pages/Untitled.ipynb +0 -6
- requirements.txt +10 -0
absts/.DS_Store DELETED
Binary file (6.15 kB)

app.py CHANGED
@@ -1,62 +1,990 @@
 import streamlit as st
 
-
-
-
-
 
-
-
-st.sidebar.success("Select a function above.")
-st.sidebar.markdown("Current functions include visualizing papers in the arxiv embedding, searching for similar papers to an input paper or prompt phrase, or answering quick questions.")
 
-
 """
-
-visualizing papers on the [arXiv](https://arxiv.org/) using the context
-sensitivity from modern large language models (LLMs) to better link paper contexts.
-
-**👈 Select a tool from the sidebar** to see some examples
-of what this framework can do!
-
-### Tool summary:
-- `Paper search` looks for relevant papers given an arxiv id or a question.
-- `Arxiv embedding` shows the landscape of current galaxy evolution papers (astro-ph.GA)
-- `Answering questions` brings it all together using RAG to give concise answers to questions with primary sources and relevant papers.
-- `Author search` uses a list of authors for the papers to visualize trajectories of individual researchers or groups over time.
-- `Research hotspots` uses paper ages to visualize excess research at a particular time in the past in different parts of the embedding space.
-
-This is not meant to be a replacement to existing tools like the
-[ADS](https://ui.adsabs.harvard.edu/),
-[arxivsorter](https://www.arxivsorter.org/), but rather a supplement to find papers
-that otherwise might be missed during a literature survey.
-It is also only trained on astro-ph.GA (astrophysics of galaxies) papers currently,
-if you are interested in extending it please reach out!
-
-The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
-using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
-atlas that shows well studied (forests) and currently uncharted areas (water).
 """
-)
 
-
-st.
 
-st.
 """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
| 1 | 
         
             
            import streamlit as st
         
     | 
| 2 | 
         
            +
            st.set_page_config(layout="wide")
         
     | 
| 3 | 
         | 
| 4 | 
         
            +
            import numpy as np
         
     | 
| 5 | 
         
            +
            from abc import ABC, abstractmethod
         
     | 
| 6 | 
         
            +
            from typing import List, Dict, Any, Tuple
         
     | 
| 7 | 
         
            +
            from collections import defaultdict
         
     | 
| 8 | 
         
            +
            from tqdm import tqdm
         
     | 
| 9 | 
         
            +
            import pandas as pd
         
     | 
| 10 | 
         
            +
            from datetime import datetime, date
         
     | 
| 11 | 
         
            +
            from datasets import load_dataset, load_from_disk
         
     | 
| 12 | 
         
            +
            from collections import Counter
         
     | 
| 13 | 
         | 
| 14 | 
         
            +
            import yaml, json, requests, sys, os, time
         
     | 
| 15 | 
         
            +
            import concurrent.futures
         
     | 
| 
         | 
|
| 
         | 
|
| 16 | 
         | 
| 17 | 
         
            +
            from langchain import hub
         
     | 
| 18 | 
         
            +
            from langchain_openai import ChatOpenAI as openai_llm
         
     | 
| 19 | 
         
            +
            from langchain_openai import OpenAIEmbeddings
         
     | 
| 20 | 
         
            +
            from langchain_core.runnables import RunnableConfig, RunnablePassthrough, RunnableParallel
         
     | 
| 21 | 
         
            +
            from langchain_core.prompts import PromptTemplate
         
     | 
| 22 | 
         
            +
            from langchain_community.callbacks import StreamlitCallbackHandler
         
     | 
| 23 | 
         
            +
            from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
         
     | 
| 24 | 
         
            +
            from langchain_community.vectorstores import Chroma
         
     | 
| 25 | 
         
            +
            from langchain_community.document_loaders import TextLoader
         
     | 
| 26 | 
         
            +
            from langchain.agents import create_react_agent, Tool, AgentExecutor
         
     | 
| 27 | 
         
            +
            from langchain.text_splitter import RecursiveCharacterTextSplitter
         
     | 
| 28 | 
         
            +
            from langchain_core.output_parsers import StrOutputParser
         
     | 
| 29 | 
         
            +
            from langchain.callbacks import FileCallbackHandler
         
     | 
| 30 | 
         
            +
            from langchain.callbacks.manager import CallbackManager
         
     | 
| 31 | 
         
            +
             
     | 
| 32 | 
         
            +
            import instructor
         
     | 
| 33 | 
         
            +
            from pydantic import BaseModel, Field
         
     | 
| 34 | 
         
            +
            from typing import List, Literal
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            from nltk.corpus import stopwords
         
     | 
| 37 | 
         
            +
            import nltk
         
     | 
| 38 | 
         
            +
            from openai import OpenAI
         
     | 
| 39 | 
         
            +
            # import anthropic
         
     | 
| 40 | 
         
            +
            import cohere
         
     | 
| 41 | 
         
            +
            import faiss
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
            import spacy
         
     | 
| 44 | 
         
            +
            from string import punctuation
         
     | 
| 45 | 
         
            +
            import pytextrank
         
     | 
| 46 | 
         
            +
             
     | 
| 47 | 
         
            +
            from bokeh.plotting import figure
         
     | 
| 48 | 
         
            +
            from bokeh.models import ColumnDataSource
         
     | 
| 49 | 
         
            +
            from bokeh.io import output_notebook
         
     | 
| 50 | 
         
            +
            from bokeh.palettes import Spectral5
         
     | 
| 51 | 
         
            +
            from bokeh.transform import linear_cmap
         
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
            ts = time.time()
         
     | 
| 54 | 
         
            +
            st.session_state.ts = ts
         
     | 
| 55 | 
         
            +
             
     | 
| 56 | 
         
            +
            openai_key = st.secrets["openai_key"]
         
     | 
| 57 | 
         
            +
            # cohere_key = st.secrets['cohere_key']
         
     | 
| 58 | 
         
            +
            cohere_key = 'Of1MjzFjGmvzBAqdvNHTQLkAjecPcOKpiIPAnFMn'
         
     | 
| 59 | 
         
            +
             
     | 
| 60 | 
         
            +
            if 'nlp' not in st.session_state:
         
     | 
| 61 | 
         
            +
                nlp = spacy.load("en_core_web_sm")
         
     | 
| 62 | 
         
            +
                nlp.add_pipe("textrank")
         
     | 
| 63 | 
         
            +
                st.session_state.nlp = nlp
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            try:
         
     | 
| 66 | 
         
            +
                stopwords.words('english')
         
     | 
| 67 | 
         
            +
            except:
         
     | 
| 68 | 
         
            +
                nltk.download('stopwords')
         
     | 
| 69 | 
         
            +
                stopwords.words('english')
         
     | 
| 70 | 
         
            +
             
     | 
| 71 | 
         
            +
            st.session_state.gen_llm = openai_llm(temperature=0,
         
     | 
| 72 | 
         
            +
                                 model_name='gpt-4o-mini', 
         
     | 
| 73 | 
         
            +
                                 openai_api_key = openai_key)
         
     | 
| 74 | 
         
            +
            st.session_state.consensus_client = instructor.patch(OpenAI(api_key=openai_key))
         
     | 
| 75 | 
         
            +
            st.session_state.embed_client = OpenAI(api_key = openai_key)
         
     | 
| 76 | 
         
            +
            embed_model = "text-embedding-3-small"
         
     | 
| 77 | 
         
            +
            st.session_state.embeddings = OpenAIEmbeddings(model = embed_model, api_key = openai_key)
         
     | 
| 78 | 
         
            +
             
     | 
| 79 | 
         
            +
            st.image('local_files/pathfinder_logo.png')
         
     | 
| 80 | 
         
            +
             
     | 
| 81 | 
         
            +
            st.expander("What is Pathfinder / How do I use it?", expanded=False).write(
         
     | 
| 82 | 
         
            +
                    """
         
     | 
| 83 | 
         
            +
                    Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
         
     | 
| 84 | 
         
            +
                    sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
         
     | 
| 85 | 
         
            +
             
     | 
| 86 | 
         
            +
                    This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
                    **👈 Use the sidebar to tweak the search parameters to get better results**.
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
                    ### Tool summary:
         
     | 
| 91 | 
         
            +
                    - Please wait while the initial data loads and compiles, this takes about a minute initially.
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
                    This is not meant to be a replacement to existing tools like the
         
     | 
| 94 | 
         
            +
                    [ADS](https://ui.adsabs.harvard.edu/),
         
     | 
| 95 | 
         
            +
                    [arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
         
     | 
| 96 | 
         
            +
                    that otherwise might be missed during a literature survey.
         
     | 
| 97 | 
         
            +
                    It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
         
     | 
| 98 | 
         
            +
                    if you are interested in extending it please reach out!
         
     | 
| 99 | 
         
            +
             
     | 
| 100 | 
         
            +
                    Also add: feedback form, socials, literature, contact us, copyright, collaboration, etc.
         
     | 
| 101 | 
         
            +
             
     | 
| 102 | 
         
            +
                    The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
         
     | 
| 103 | 
         
            +
                    using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
         
     | 
| 104 | 
         
            +
                    atlas that shows well studied (forests) and currently uncharted areas (water).
         
     | 
| 105 | 
         
            +
                    """
         
     | 
| 106 | 
         
            +
                )
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
             
     | 
| 109 | 
         
            +
            st.sidebar.header("Fine-tune the search")
         
     | 
| 110 | 
         
            +
            top_k = st.sidebar.slider("Number of papers to retrieve:", 3, 30, 10)
         
     | 
| 111 | 
         
            +
            extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")
         
     | 
| 112 | 
         
            +
             
     | 
| 113 | 
         
            +
            st.sidebar.subheader("Toggles")
         
     | 
| 114 | 
         
            +
            toggle_a = st.sidebar.toggle("Weight by keywords", value = False)
         
     | 
| 115 | 
         
            +
            toggle_b = st.sidebar.toggle("Weight by date", value = False)
         
     | 
| 116 | 
         
            +
            toggle_c = st.sidebar.toggle("Weight by citations", value = False)
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
            method = st.sidebar.radio("Retrieval method:", ["Semantic search", "Semantic search + HyDE", "Semantic search + HyDE + CoHERE"], index=2)
         
     | 
| 119 | 
         
            +
             
     | 
| 120 | 
         
            +
            method2 = st.sidebar.radio("Generation complexity:", ["Basic RAG","ReAct Agent"])
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
            question_type = st.sidebar.selectbox("Select question type:", ["Multi-paper (Default)", "Single-paper", "Bibliometric", "Broad but nuanced"])
         
     | 
| 123 | 
         
            +
            st.session_state.question_type = question_type
         
     | 
| 124 | 
         
            +
            # store_output = st.sidebar.button("Save output")
         
     | 
| 125 | 
         
            +
             
     | 
| 126 | 
         
            +
            query = st.text_input("Ask me anything:")
         
     | 
| 127 | 
         
            +
            submit_button = st.button("Run pathfinder!")
         
     | 
| 128 | 
         
            +
             
     | 
| 129 | 
         
            +
            search_text_list = ['rooting around in the paper pile...','looking for clarity...','scanning the event horizon...','peering into the abyss...','potatoes power this ongoing search...']
         
     | 
| 130 | 
         
            +
             
     | 
| 131 | 
         
            +
            if 'arxiv_corpus' not in st.session_state:
         
     | 
| 132 | 
         
            +
                with st.spinner('loading data (please wait for this to finish before querying)...'):
         
     | 
| 133 | 
         
            +
                    # try:
         
     | 
| 134 | 
         
            +
                    arxiv_corpus = load_from_disk('data/')
         
     | 
| 135 | 
         
            +
                    # except:
         
     | 
| 136 | 
         
            +
                    #     st.write('downloading data')
         
     | 
| 137 | 
         
            +
                    #     arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data',split='train')
         
     | 
| 138 | 
         
            +
                    #     # arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data_galaxy',split='train')
         
     | 
| 139 | 
         
            +
                    #     arxiv_corpus.save_to_disk('data/')
         
     | 
| 140 | 
         
            +
                    arxiv_corpus.add_faiss_index('embed')
         
     | 
| 141 | 
         
            +
                    st.session_state.arxiv_corpus = arxiv_corpus
         
     | 
| 142 | 
         
            +
                    st.toast('loaded arxiv corpus')
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            if 'ids' not in st.session_state:
         
     | 
| 145 | 
         
            +
                with st.spinner('making the LLM talk to the astro papers...'):
         
     | 
| 146 | 
         
            +
                    st.session_state.ids = st.session_state.arxiv_corpus['ads_id']
         
     | 
| 147 | 
         
            +
                    st.session_state.titles = st.session_state.arxiv_corpus['title']
         
     | 
| 148 | 
         
            +
                    st.session_state.abstracts = st.session_state.arxiv_corpus['abstract']
         
     | 
| 149 | 
         
            +
                    st.session_state.authors = st.session_state.arxiv_corpus['authors']
         
     | 
| 150 | 
         
            +
                    st.session_state.cites = st.session_state.arxiv_corpus['cites']
         
     | 
| 151 | 
         
            +
                    st.session_state.years = st.session_state.arxiv_corpus['date']
         
     | 
| 152 | 
         
            +
                    st.session_state.kws = st.session_state.arxiv_corpus['keywords']
         
     | 
| 153 | 
         
            +
                    st.session_state.ads_kws = st.session_state.arxiv_corpus['ads_keywords']
         
     | 
| 154 | 
         
            +
                    st.session_state.bibcode = st.session_state.arxiv_corpus['bibcode']
         
     | 
| 155 | 
         
            +
                    st.session_state.umap_x = st.session_state.arxiv_corpus['umap_x']
         
     | 
| 156 | 
         
            +
                    st.session_state.umap_y = st.session_state.arxiv_corpus['umap_y']
         
     | 
| 157 | 
         
            +
                    st.toast('done caching. time taken: %.2f sec' %(time.time()-ts))   
         
     | 
| 158 | 
         
            +
             
     | 
| 159 | 
         
            +
            def get_keywords(text):
         
     | 
| 160 | 
         
            +
                result = []
         
     | 
| 161 | 
         
            +
                pos_tag = ['PROPN', 'ADJ', 'NOUN']
         
     | 
| 162 | 
         
            +
                doc = st.session_state.nlp(text.lower())
         
     | 
| 163 | 
         
            +
                for token in doc:
         
     | 
| 164 | 
         
            +
                    if(token.text in st.session_state.nlp.Defaults.stop_words or token.text in punctuation):
         
     | 
| 165 | 
         
            +
                        continue
         
     | 
| 166 | 
         
            +
                    if(token.pos_ in pos_tag):
         
     | 
| 167 | 
         
            +
                        result.append(token.text)
         
     | 
| 168 | 
         
            +
                return result
         
     | 
| 169 | 
         
            +
             
     | 
| 170 | 
         
            +
            def parse_doc(text, nret = 10):
         
     | 
| 171 | 
         
            +
                local_kws = []
         
     | 
| 172 | 
         
            +
                doc = st.session_state.nlp(text)
         
     | 
| 173 | 
         
            +
                # examine the top-ranked phrases in the document
         
     | 
| 174 | 
         
            +
                for phrase in doc._.phrases[:nret]:
         
     | 
| 175 | 
         
            +
                    # print(phrase.text)
         
     | 
| 176 | 
         
            +
                    local_kws.append(phrase.text)
         
     | 
| 177 | 
         
            +
                return local_kws
         
     | 
| 178 | 
         
            +
             
     | 
| 179 | 
         
            +
            class EmbeddingRetrievalSystem():
         
     | 
| 180 | 
         
            +
             
     | 
| 181 | 
         
            +
                def __init__(self, weight_citation = False, weight_date = False, weight_keywords = False):
         
     | 
| 182 | 
         
            +
             
     | 
| 183 | 
         
            +
                    self.ids = st.session_state.ids
         
     | 
| 184 | 
         
            +
                    self.years = st.session_state.years
         
     | 
| 185 | 
         
            +
                    self.abstract = st.session_state.abstracts
         
     | 
| 186 | 
         
            +
                    self.client = OpenAI(api_key = openai_key)
         
     | 
| 187 | 
         
            +
                    self.embed_model = "text-embedding-3-small"
         
     | 
| 188 | 
         
            +
                    self.dataset = st.session_state.arxiv_corpus
         
     | 
| 189 | 
         
            +
                    self.kws = st.session_state.kws
         
     | 
| 190 | 
         
            +
                    self.cites = st.session_state.cites
         
     | 
| 191 | 
         
            +
             
     | 
| 192 | 
         
            +
                    self.weight_citation = weight_citation
         
     | 
| 193 | 
         
            +
                    self.weight_date = weight_date
         
     | 
| 194 | 
         
            +
                    self.weight_keywords = weight_keywords
         
     | 
| 195 | 
         
            +
                    self.id_to_index = {self.ids[i]: i for i in range(len(self.ids))}
         
     | 
| 196 | 
         
            +
             
     | 
| 197 | 
         
            +
                    # self.citation_filter = CitationFilter(self.dataset)
         
     | 
| 198 | 
         
            +
                    # self.date_filter = DateFilter(self.dataset['date'])
         
     | 
| 199 | 
         
            +
                    # self.keyword_filter = KeywordFilter(corpus=self.dataset, remove_capitals=True)
         
     | 
| 200 | 
         
            +
             
     | 
| 201 | 
         
            +
                def parse_date(self, id):
         
     | 
| 202 | 
         
            +
                    # indexval = np.where(self.ids == id)[0][0]
         
     | 
| 203 | 
         
            +
                    indexval = id
         
     | 
| 204 | 
         
            +
                    return self.years[indexval]
         
     | 
| 205 | 
         
            +
             
     | 
| 206 | 
         
            +
                def make_embedding(self, text):
         
     | 
| 207 | 
         
            +
                    str_embed = self.client.embeddings.create(input = [text], model = self.embed_model).data[0].embedding
         
     | 
| 208 | 
         
            +
                    return str_embed
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
                def embed_batch(self, texts: List[str]) -> List[np.ndarray]:
         
     | 
| 211 | 
         
            +
                    embeddings = self.client.embeddings.create(input=texts, model=self.embed_model).data
         
     | 
| 212 | 
         
            +
                    return [np.array(embedding.embedding, dtype=np.float32) for embedding in embeddings]
         
     | 
| 213 | 
         
            +
             
     | 
| 214 | 
         
            +
                def get_query_embedding(self, query):
         
     | 
| 215 | 
         
            +
                    return self.make_embedding(query)
         
     | 
| 216 | 
         
            +
             
     | 
| 217 | 
         
            +
                def analyze_temporal_query(self, query):
         
     | 
| 218 | 
         
            +
                    return
         
     | 
| 219 | 
         
            +
             
     | 
| 220 | 
         
            +
                def calc_faiss(self, query_embedding, top_k = 100):
         
     | 
| 221 | 
         
            +
                    # xq = query_embedding.reshape(-1,1).T.astype('float32')
         
     | 
| 222 | 
         
            +
                    # D, I = self.index.search(xq, top_k)
         
     | 
| 223 | 
         
            +
                    # return I[0], D[0]
         
     | 
| 224 | 
         
            +
                    tmp = self.dataset.search('embed', query_embedding, k=top_k)
         
     | 
| 225 | 
         
            +
                    return [tmp.indices, tmp.scores]
         
     | 
| 226 | 
         
            +
             
     | 
| 227 | 
         
            +
                def rank_and_filter(self, query, query_embedding, query_date, top_k = 10, return_scores=False, time_result=None):
         
     | 
| 228 | 
         
            +
             
     | 
| 229 | 
         
            +
                    # st.write('status')
         
     | 
| 230 | 
         
            +
             
     | 
| 231 | 
         
            +
                    # st.write('toggles', self.toggles)
         
     | 
| 232 | 
         
            +
                    # st.write('question_type', self.question_type)
         
     | 
| 233 | 
         
            +
                    # st.write('rag method', self.rag_method)
         
     | 
| 234 | 
         
            +
                    # st.write('gen method', self.gen_method)
         
     | 
| 235 | 
         
            +
             
     | 
| 236 | 
         
            +
                    self.weight_keywords = self.toggles["Keyword weighting"]
         
     | 
| 237 | 
         
            +
                    self.weight_date = self.toggles["Time weighting"]
         
     | 
| 238 | 
         
            +
                    self.weight_citation = self.toggles["Citation weighting"]
         
     | 
| 239 | 
         
            +
             
     | 
| 240 | 
         
            +
                    topk_indices, similarities = self.calc_faiss(np.array(query_embedding), top_k = 1000)
         
     | 
| 241 | 
         
            +
                    similarities = 1/similarities # converting from a distance (less is better) to a similarity (more is better)
         
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
                    query_kws = get_keywords(query)
         
     | 
| 244 | 
         
            +
                    input_kws = self.query_input_keywords
         
     | 
| 245 | 
         
            +
                    query_kws = query_kws + input_kws
         
     | 
| 246 | 
         
            +
                    self.query_kws = query_kws
         
     | 
| 247 | 
         
            +
             
     | 
| 248 | 
         
            +
                    if self.weight_keywords == True:
         
     | 
| 249 | 
         
            +
                        sub_kws = [self.kws[i] for i in topk_indices]
         
     | 
| 250 | 
         
            +
                        kw_weight = np.zeros((len(topk_indices),)) + 0.1
         
     | 
| 251 | 
         
            +
             
     | 
| 252 | 
         
            +
                        for k in query_kws:
         
     | 
| 253 | 
         
            +
                            for i in (range(len(topk_indices))):
         
     | 
| 254 | 
         
            +
                                for j in range(len(sub_kws[i])):
         
     | 
| 255 | 
         
            +
                                    if k.lower() in sub_kws[i][j].lower():
         
     | 
| 256 | 
         
            +
                                        kw_weight[i] = kw_weight[i] + 0.1
         
     | 
| 257 | 
         
            +
                                        # print(i, k, sub_kws[i][j])
         
     | 
| 258 | 
         
            +
             
     | 
| 259 | 
         
            +
                        # kw_weight = kw_weight**0.36 / np.amax(kw_weight**0.36)
         
     | 
| 260 | 
         
            +
                        kw_weight = kw_weight / np.amax(kw_weight)
         
     | 
| 261 | 
         
            +
                    else:
         
     | 
| 262 | 
         
            +
                        kw_weight = np.ones((len(topk_indices),))
         
     | 
| 263 | 
         
            +
             
     | 
| 264 | 
         
            +
                    if self.weight_date == True:
         
     | 
| 265 | 
         
            +
                        sub_dates = [self.years[i] for i in topk_indices]
         
     | 
| 266 | 
         
            +
                        date = datetime.now().date()
         
     | 
| 267 | 
         
            +
                        date_diff = np.array([((date - i).days / 365.) for i in sub_dates])
         
     | 
| 268 | 
         
            +
                        # age_weight = (1 + np.exp(date_diff/2.1))**(-1) + 0.5
         
     | 
| 269 | 
         
            +
                        age_weight = (1 + np.exp(date_diff/0.7))**(-1)
         
     | 
| 270 | 
         
            +
                        age_weight = age_weight / np.amax(age_weight)
         
     | 
| 271 | 
         
            +
                    else:
         
     | 
| 272 | 
         
            +
                        age_weight = np.ones((len(topk_indices),))
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
                    if self.weight_citation == True:
         
     | 
| 275 | 
         
            +
                        # st.write('weighting by citations')
         
     | 
| 276 | 
         
            +
                        sub_cites = np.array([self.cites[i] for i in topk_indices])
         
     | 
| 277 | 
         
            +
                        temp = sub_cites.copy()
         
     | 
| 278 | 
         
            +
                        temp[sub_cites > 300] = 300.
         
     | 
| 279 | 
         
            +
                        cite_weight = (1 + np.exp((300-temp)/42.0))**(-1.)
         
     | 
| 280 | 
         
            +
                        cite_weight = cite_weight / np.amax(cite_weight)
         
     | 
| 281 | 
         
            +
                    else:
         
     | 
| 282 | 
         
            +
                        cite_weight = np.ones((len(topk_indices),))
         
     | 
| 283 | 
         
            +
             
     | 
| 284 | 
         
            +
                    similarities = similarities * (kw_weight) * (age_weight) * (cite_weight)
         
     | 
| 285 | 
         
            +
             
     | 
| 286 | 
         
            +
                    filtered_results = [[topk_indices[i], similarities[i]] for i in range(len(similarities))]
         
     | 
| 287 | 
         
            +
                    top_results = sorted(filtered_results, key=lambda x: x[1], reverse=True)[:top_k]
         
     | 
| 288 | 
         
            +
             
     | 
| 289 | 
         
            +
                    if return_scores:
         
     | 
| 290 | 
         
            +
                        return {doc[0]: doc[1] for doc in top_results}
         
     | 
| 291 | 
         
            +
             
     | 
| 292 | 
         
            +
                    # Only keep the document IDs
         
     | 
| 293 | 
         
            +
                    top_results = [doc[0] for doc in top_results]
         
     | 
| 294 | 
         
            +
                    return top_results
         
     | 
| 295 | 
         
            +
             
     | 
| 296 | 
         
            +
                def retrieve(self, query, top_k, time_result=None, query_date = None, return_scores = False):
         
     | 
| 297 | 
         
            +
             
     | 
| 298 | 
         
            +
                    query_embedding = self.get_query_embedding(query)
         
     | 
| 299 | 
         
            +
             
     | 
| 300 | 
         
            +
                    # Judge time relevance
         
     | 
| 301 | 
         
            +
                    if time_result is None:
         
     | 
| 302 | 
         
            +
                        if self.weight_date:
         
     | 
| 303 | 
         
            +
                            time_result, time_taken = self.analyze_temporal_query(query, self.anthropic_client)
         
     | 
| 304 | 
         
            +
                        else:
         
     | 
| 305 | 
         
            +
                            time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}
         
     | 
| 306 | 
         
            +
             
     | 
| 307 | 
         
            +
                    top_results = self.rank_and_filter(query,
         
     | 
| 308 | 
         
            +
                                                       query_embedding,
         
     | 
| 309 | 
         
            +
                                                       query_date,
         
     | 
| 310 | 
         
            +
                                                       top_k,
         
     | 
| 311 | 
         
            +
                                                       return_scores = return_scores,
         
     | 
| 312 | 
         
            +
                                                       time_result = time_result)
         
     | 
| 313 | 
         
            +
             
     | 
| 314 | 
         
            +
                    return top_results
         
     | 
| 315 | 
         
            +
             
     | 
class HydeRetrievalSystem(EmbeddingRetrievalSystem):
    def __init__(self, generation_model: str = "claude-3-haiku-20240307",
                 embedding_model: str = "text-embedding-3-small",
                 temperature: float = 0.5,
                 max_doclen: int = 500,
                 generate_n: int = 1,
                 embed_query=True,
                 conclusion=False, **kwargs):

        # Handle the kwargs for the superclass init -- filters/citation weighting
        super().__init__(**kwargs)

        if max_doclen * generate_n > 8191:
            raise ValueError("Too many tokens. Please reduce max_doclen or generate_n.")

        self.embedding_model = embedding_model
        self.generation_model = generation_model

        # HYPERPARAMETERS
        self.temperature = temperature  # generation temperature
        self.max_doclen = max_doclen    # max tokens per generated document
        self.generate_n = generate_n    # how many documents to generate
        self.embed_query = embed_query  # embed the query vector as well?
        self.conclusion = conclusion    # generate a conclusion as well?

        # self.anthropic_key = anthropic_key
        # self.generation_client = anthropic.Anthropic(api_key=self.anthropic_key)
        self.generation_client = openai_llm(temperature=0, model_name='gpt-4o-mini', openai_api_key=openai_key)

    def retrieve(self, query: str, top_k: int = 10, return_scores=False, time_result=None) -> List[Tuple[str, str, float]]:
        if time_result is None:
            if self.weight_date:
                time_result, time_taken = self.analyze_temporal_query(query, self.anthropic_client)
            else:
                time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}

        docs = self.generate_docs(query)
        st.expander('Abstract generated with HyDE', expanded=False).write(docs)

        doc_embeddings = self.embed_docs(docs)

        if self.embed_query:
            query_emb = self.embed_docs([query])[0]
            doc_embeddings.append(query_emb)

        embedding = np.mean(np.array(doc_embeddings), axis=0)

        top_results = self.rank_and_filter(query, embedding, query_date=None, top_k=top_k, return_scores=return_scores, time_result=time_result)

        return top_results

    def generate_doc(self, query: str):
        prompt = """You are an expert astronomer. Given a scientific query, generate the abstract of an expert-level research paper
                    that answers the question. Stick to a maximum length of {} tokens and return just the text of the abstract and conclusion.
                    Do not include labels for any section. Use research-specific jargon.""".format(self.max_doclen)
        # st.write('invoking hyde generation')

        # message = self.generation_client.messages.create(
        #         model=self.generation_model,
        #         max_tokens=self.max_doclen,
        #         temperature=self.temperature,
        #         system=prompt,
        #         messages=[{"role": "user",
        #                    "content": [{"type": "text", "text": query}]}]
        #     )
        # return message.content[0].text

        messages = [("system", prompt), ("human", query)]
        return self.generation_client.invoke(messages).content

    def generate_docs(self, query: str):
        docs = []
        for i in range(self.generate_n):
            docs.append(self.generate_doc(query))
        return docs

    def embed_docs(self, docs: List[str]):
        return self.embed_batch(docs)
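# HyDE in brief: rather than embedding the short user query directly, the class above
# asks the LLM for generate_n synthetic abstracts, embeds them (plus the raw query when
# embed_query=True), and searches with the mean vector. A minimal sketch of that
# averaging step, assuming 1536-dimensional embeddings as in text-embedding-3-small:
#
#   import numpy as np
#   doc_embeddings = [np.random.rand(1536) for _ in range(3)]  # stand-ins for real embeddings
#   doc_embeddings.append(np.random.rand(1536))                # the embedded query itself
#   search_vector = np.mean(np.array(doc_embeddings), axis=0)  # shape (1536,)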
class HydeCohereRetrievalSystem(HydeRetrievalSystem):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.cohere_key = cohere_key
        self.cohere_client = cohere.Client(self.cohere_key)

    def retrieve(self, query: str,
                 top_k: int = 10,
                 rerank_top_k: int = 250,
                 return_scores=False, time_result=None,
                 reweight=False) -> List[Tuple[str, str, float]]:

        if time_result is None:
            if self.weight_date:
                time_result, time_taken = self.analyze_temporal_query(query, self.anthropic_client)
            else:
                time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}

        top_results = super().retrieve(query, top_k=rerank_top_k, time_result=time_result)

        # doc_texts = self.get_document_texts(top_results)
        # docs_for_rerank = [f"Abstract: {doc['abstract']}\nConclusions: {doc['conclusions']}" for doc in doc_texts]
        docs_for_rerank = [self.abstract[i] for i in top_results]

        if len(docs_for_rerank) == 0:
            return []

        reranked_results = self.cohere_client.rerank(
            query=query,
            documents=docs_for_rerank,
            model='rerank-english-v3.0',
            top_n=top_k
        )

        final_results = []
        for result in reranked_results.results:
            doc_id = top_results[result.index]
            doc_text = docs_for_rerank[result.index]
            score = float(result.relevance_score)
            final_results.append([doc_id, "", score])

        if reweight:
            if time_result['has_temporal_aspect']:
                final_results = self.date_filter.filter(final_results, time_score=time_result['expected_recency_weight'])

            if self.weight_citation:
                self.citation_filter.filter(final_results)

        if return_scores:
            return {result[0]: result[2] for result in final_results}

        return [doc[0] for doc in final_results]

    def embed_docs(self, docs: List[str]):
        return self.embed_batch(docs)
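# The Cohere variant is a two-stage funnel: HyDE retrieval first pulls a wide candidate
# pool (rerank_top_k, default 250), then rerank-english-v3.0 re-scores those abstracts
# against the raw query and keeps only top_k. Hypothetical usage sketch:
#
#   ec = HydeCohereRetrievalSystem()
#   ids = ec.retrieve("what drives AGN feedback?", top_k=10, rerank_top_k=250)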
# --------- other fns ------------------

def get_topk(query, top_k):
    print('running retrieval')
    rs = st.session_state.ec.retrieve(query, top_k, return_scores=True)
    return rs

def Library(query, top_k=7):
    rs = get_topk(query, top_k=top_k)
    op_docs = ''
    for paperno, i in enumerate(rs):
        op_docs = op_docs + 'Paper %.0f:' % (paperno + 1) + ' (published in ' + st.session_state.bibcode[i][0:4] + ') ' + st.session_state.titles[i] + '\n' + st.session_state.abstracts[i] + '\n\n'

    return op_docs
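# Library() flattens the retrieved set into a single prompt-ready context string,
# e.g. (illustrative output; titles and abstracts are placeholders):
#
#   Paper 1: (published in 2021) <title of first hit>
#   <abstract of first hit>
#
#   Paper 2: ...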
def Library2(query, top_k=7):
    rs = get_topk(query, top_k=top_k)
    absts, fnames = [], []
    for paperno, i in enumerate(rs):
        absts.append(st.session_state.abstracts[i])
        fnames.append(st.session_state.bibcode[i])
    return absts, fnames, rs

def get_paper_df(ids):

    papers, scores, yrs, links, cites, kws, authors, absts = [], [], [], [], [], [], [], []
    for i in ids:
        papers.append(st.session_state.titles[i])
        scores.append(ids[i])
        links.append('https://ui.adsabs.harvard.edu/abs/' + st.session_state.bibcode[i] + '/abstract')
        yrs.append(st.session_state.bibcode[i][0:4])
        cites.append(st.session_state.cites[i])
        authors.append(st.session_state.authors[i][0])
        kws.append(st.session_state.ads_kws[i])
        absts.append(st.session_state.abstracts[i])

    return pd.DataFrame({
        'Title': papers,
        'Relevance': scores,
        'Lead author': authors,
        'Year': yrs,
        'ADS Link': links,
        'Citations': cites,
        'Keywords': kws,
        'Abstract': absts
    })
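# get_paper_df expects the {doc_id: score} dict produced by retrieve(return_scores=True):
# iterating over the dict yields the IDs, and ids[i] looks up each relevance score.
# Hypothetical usage:
#
#   rs = get_topk("dark matter halos", top_k=5)
#   st.dataframe(get_paper_df(rs))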
def extract_keywords(question, ec):
    # Simulated keyword extraction (replace with actual logic)
    return ['keyword1', 'keyword2', 'keyword3']

# Function to estimate consensus (replace with actual implementation)
def estimate_consensus():
    # Simulated consensus estimation (replace with actual calculation)
    return 0.75

def run_agent_qa(query, top_k):

    # define tools
    search = DuckDuckGoSearchAPIWrapper()
    tools = [
        Tool(
            name="Library",
            func=Library,
            description="A source of information pertinent to your question. Do not answer a question without consulting this!"
        ),
        Tool(
            name="Search",
            func=search.run,
            description="useful for when you need to look up knowledge about common topics or current events",
        )
    ]

    if 'tools' not in st.session_state:
        st.session_state.tools = tools

    # define prompt

    # for another question type:
    # First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order.
    # Quotes should be relatively short. If there are no relevant quotes, write "No relevant quotes" instead.

    template = """You are an expert astronomer and cosmologist.
    Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
    If you can not come up with an answer, say you do not know.
    Try to break the question down into smaller steps and solve it in a logical manner.

    You have access to the following tools:

    {tools}

    Use the following format:

    Question: the input question you must answer
    Thought: you should always think about what to do
    Action: the action to take, should be one of [{tool_names}]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    Thought: I now know the final answer
    Final Answer: the final answer to the original input question. Provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of.

    Begin! Remember to speak in a pedagogical and factual manner.

    Question: {input}
    Thought:{agent_scratchpad}"""

    prompt = hub.pull("hwchase17/react")
    prompt.template = template

    # path to write the intermediate agent trace to
    file_path = "agent_trace.txt"
    try:
        os.remove(file_path)
    except OSError:
        pass
    file_handler = FileCallbackHandler(file_path)
    callback_manager = CallbackManager([file_handler])

    # define and execute the agent
    tool_names = [tool.name for tool in st.session_state.tools]
    if 'agent' not in st.session_state:
        # agent = ZeroShotAgent(llm_chain=llm_chain, allowed_tools=tool_names)
        agent = create_react_agent(llm=st.session_state.gen_llm, tools=tools, prompt=prompt)
        st.session_state.agent = agent

    if 'agent_executor' not in st.session_state:
        agent_executor = AgentExecutor(agent=st.session_state.agent, tools=st.session_state.tools, verbose=True, handle_parsing_errors=True, callbacks=CallbackManager([file_handler]))
        st.session_state.agent_executor = agent_executor

    answer = st.session_state.agent_executor.invoke({"input": query})
    return answer
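# Note: the agent reuses the hub's "hwchase17/react" prompt object but overwrites its
# template with the astronomy-specific one above, so the {tools}, {tool_names}, {input}
# and {agent_scratchpad} placeholders must all survive any edit to the template.
# Hypothetical invocation:
#
#   answer = run_agent_qa("Is the Milky Way's stellar halo metal-poor?", top_k=7)
#   st.write(answer['output'])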
regular_prompt = """You are an expert astronomer and cosmologist.
Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
If you can not come up with an answer, say you do not know.
Try to break the question down into smaller steps and solve it in a logical manner.

Provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of.

Begin! Remember to speak in a pedagogical and factual manner.

Relevant documents:{context}

Question: {question}
Answer:"""

bibliometric_prompt = """You are an AI assistant with expertise in astronomy and astrophysics literature. Your task is to assist with relevant bibliometric information in response to a user question. The user question may consist of identifying key papers, authors, or trends in a specific area of astronomical research.

Depending on what the user wants, direct them to consult the NASA Astrophysics Data System (ADS) at https://ui.adsabs.harvard.edu/. Provide them with the recommended ADS query depending on their question.

Here's a more detailed guide on how to use NASA ADS for various types of queries:

Basic topic search: Enter keywords in the search bar, e.g., "exoplanets". Use quotation marks for exact phrases, e.g., "dark energy"
Author search: Use the syntax author:"Last Name, First Name", e.g., author:"Hawking, S". For papers by multiple authors, use AND, e.g., author:"Hawking, S" AND author:"Ellis, G"
Date range: Use year:YYYY-YYYY, e.g., year:2010-2020. For papers since a certain year, use year:YYYY-, e.g., year:2015-
Combining search terms: Use AND, OR, NOT operators, e.g., "black holes" AND (author:"Hawking, S" OR author:"Penrose, R")
Filtering results: Use the left sidebar to filter by publication year, article type, or astronomy database
Sorting results: Use the "Sort" dropdown menu to order by options like citation count, publication date, or relevance
Advanced searches: Click on the "Search" dropdown menu and select "Classic Form" for field-specific searches. Use bibcode:YYYY for a specific journal/year, e.g., bibcode:2020ApJ to find all Astrophysical Journal papers from 2020
Finding review articles: Wrap the query in the reviews() operator (e.g. reviews("dark energy"))
Excluding preprints: Add NOT doctype:"eprint" to your search
Citation metrics: Click on the citation count of a paper to see its citation history and who has cited it

Some examples:

Example 1:
"How many papers published in 2022 used data from MAST missions?"
Your response should be: year:2022 data:"MAST"

Example 2:
"What are the most cited papers on spiral galaxy halos measured in X-rays, with publication date from 2010 to 2023?"
Your response should be: "spiral galaxy halos" AND "x-ray" year:2010-2023

Example 3:
"Can you list 3 papers published by <name> as first author?"
Your response should be: author:"^<name>"

Example 4:
"Based on papers with <name> as an author or co-author, can you suggest the five most recent astro-ph papers that would be relevant?"
Your response should be:

Remember to advise users that while these examples cover many common scenarios, NASA ADS has many more advanced features that can be explored through its documentation.

Relevant documents:{context}
Question: {question}

Response:"""

single_paper_prompt = """You are an astronomer with access to a vast database of astronomical facts and figures. Your task is to provide a concise, accurate answer to the following specific factual question about astronomy or astrophysics.
Provide the requested information clearly and directly. If relevant, include the source of your information or any recent updates to this fact. If there's any uncertainty or variation in the accepted value, briefly explain why.
If the question can't be answered with a single fact, provide a short, focused explanation. Always prioritize accuracy over speculation.
Relevant documents:{context}
Question: {question}
Response:"""

deep_knowledge_prompt = """You are an expert astronomer with deep knowledge across various subfields of astronomy and astrophysics. Your task is to provide a comprehensive and nuanced answer to the following question, which involves an unresolved topic or requires broad, common-sense understanding.
Consider multiple perspectives and current debates in the field. Explain any uncertainties or ongoing research. If relevant, mention how this topic connects to other areas of astronomy.
Provide your response in a clear, pedagogical manner, breaking down complex concepts for easier understanding. If appropriate, suggest areas where further research might be needed.
After formulating your initial response, take a moment to reflect on your answer. Consider:
1. Have you addressed all aspects of the question?
2. Are there any potential biases or assumptions in your explanation?
3. Is your explanation clear and accessible to someone with a general science background?
4. Have you adequately conveyed the uncertainties or debates surrounding this topic?
Based on this reflection, refine your answer as needed.
Remember, while you have extensive knowledge, it's okay to acknowledge the limits of current scientific understanding. If parts of the question cannot be answered definitively, explain why.
Relevant documents:{context}

Question: {question}

Initial Response:
[Your initial response here]

Reflection and Refinement:
[Your reflections and any refinements to your answer here]

Final Response:
[Your final, refined answer here]"""
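# All four templates above expose the same two input variables, {context} and {question},
# so make_rag_qa_answer below can swap any of them into PromptTemplate.from_template
# without changing the chain wiring.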
def make_rag_qa_answer(query, top_k=10):

    # try:
    absts, fhdrs, rs = Library2(query, top_k=top_k)

    temp_abst = ''
    loaders = []
    for i in range(len(absts)):
        temp_abst = absts[i]

        try:
            text_file = open("absts/" + fhdrs[i] + ".txt", "w")
        except FileNotFoundError:
            os.mkdir('absts')
            text_file = open("absts/" + fhdrs[i] + ".txt", "w")
        n = text_file.write(temp_abst)
        text_file.close()
        loader = TextLoader("absts/" + fhdrs[i] + ".txt")
        loaders.append(loader)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50, add_start_index=True)

    splits = text_splitter.split_documents([loader.load()[0] for loader in loaders])
    vectorstore = Chroma.from_documents(documents=splits, embedding=st.session_state.embeddings, collection_name='retdoc4')
    # retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6, "fetch_k": len(splits)})
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

    for i in range(len(absts)):
        os.remove("absts/" + fhdrs[i] + ".txt")

    if st.session_state.question_type == 'Bibliometric':
        template = bibliometric_prompt
    elif st.session_state.question_type == 'Single-paper':
        template = single_paper_prompt
    elif st.session_state.question_type == 'Broad but nuanced':
        template = deep_knowledge_prompt
    else:
        template = regular_prompt
    prompt = PromptTemplate.from_template(template)

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | prompt
        | st.session_state.gen_llm
        | StrOutputParser()
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    rag_answer = rag_chain_with_source.invoke(query)

    vectorstore.delete_collection()

    # except:
    #     st.write('heavy load! please wait 10 seconds and try again.')

    return rag_answer, rs
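# Chain wiring in brief: RunnableParallel fans the query out to the Chroma retriever
# (producing "context") and passes it through unchanged as "question"; .assign() then
# runs prompt | llm | parser over that dict, so the invoke() result has the shape
# (illustrative):
#
#   {'context': [Document(...), ...], 'question': 'the raw query', 'answer': 'the LLM answer'}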
def guess_question_type(query: str):
    categorization_prompt = """You are an expert astrophysicist and computer scientist specializing in linguistics and semantics. Your task is to categorize a given query into one of the following categories:

    1. Summarization
    2. Single-paper factual
    3. Multi-paper factual
    4. Named entity recognition
    5. Jargon-specific questions / overloaded words
    6. Time-sensitive
    7. Consensus evaluation
    8. What-ifs and counterfactuals
    9. Compositional

    Analyze the query carefully, considering its content, structure, and implications. Then, determine which of the above categories best fits the query.

    In your analysis, consider the following:
    - Does the query ask for a well-known datapoint or mechanism?
    - Can it be answered by a single paper or does it require multiple sources?
    - Does it involve proper nouns or specific scientific terms?
    - Is it time-dependent or likely to change in the near future?
    - Does it require evaluating consensus across multiple sources?
    - Is it a hypothetical or counterfactual question?
    - Does it need to be broken down into sub-queries (i.e. compositional)?

    After your analysis, categorize the query into one of the nine categories listed above.

    Provide a brief explanation for your categorization, highlighting the key aspects of the query that led to your decision.

    Present your final answer in the following format:

    <categorization>
    Category: [Selected category]
    Explanation: [Your explanation for the categorization]
    </categorization>"""

    messages = [("system", categorization_prompt), ("human", query)]
    return st.session_state.ec.generation_client.invoke(messages).content
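# The <categorization> block returned above is meant to be parsed downstream to pick a
# prompt template (e.g. a "Single-paper factual" verdict maps naturally onto
# single_paper_prompt); the template choice itself keys off st.session_state.question_type
# in make_rag_qa_answer above.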
class OverallConsensusEvaluation(BaseModel):
    consensus: Literal["Strong Agreement", "Moderate Agreement", "Weak Agreement", "No Clear Consensus", "Weak Disagreement", "Moderate Disagreement", "Strong Disagreement"] = Field(
        ...,
        description="The overall level of consensus between the query and the abstracts"
    )
    explanation: str = Field(
        ...,
        description="A detailed explanation of the consensus evaluation"
    )
    relevance_score: float = Field(
        ...,
        description="A score from 0 to 1 indicating how relevant the abstracts are to the query overall",
        ge=0,
        le=1
    )

def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConsensusEvaluation:
    """
    Evaluates the overall consensus of the abstracts in relation to the query in a single LLM call.
    """
    prompt = f"""
    Query: {query}

    You will be provided with {len(abstracts)} scientific abstracts. Your task is to:
    1. Evaluate the overall consensus between the query and the abstracts.
    2. Provide a detailed explanation of your consensus evaluation.
    3. Assign an overall relevance score from 0 to 1, where 0 means completely irrelevant and 1 means highly relevant.

    For the consensus evaluation, use one of the following levels:
    Strong Agreement, Moderate Agreement, Weak Agreement, No Clear Consensus, Weak Disagreement, Moderate Disagreement, Strong Disagreement

    Here are the abstracts:

    {' '.join([f"Abstract {i+1}: {abstract}" for i, abstract in enumerate(abstracts)])}

    Provide your evaluation in a structured format.
    """

    response = st.session_state.consensus_client.chat.completions.create(
        model="gpt-4o-mini",  # used to be "gpt-4"
        response_model=OverallConsensusEvaluation,
        messages=[
            {"role": "system", "content": """You are an assistant with expertise in astrophysics for question-answering tasks.
            Evaluate the overall consensus of the retrieved scientific abstracts in relation to a given query.
            If you don't know the answer, just say that you don't know.
            Use six sentences maximum and keep the answer concise."""},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    return response
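# evaluate_overall_consensus relies on st.session_state.consensus_client being an
# instructor-patched OpenAI client: response_model=... is the instructor extension, not a
# vanilla openai argument, and the call returns a validated OverallConsensusEvaluation
# instance rather than a raw completion. A minimal setup sketch (hypothetical, assuming
# the instructor package is installed):
#
#   import instructor
#   from openai import OpenAI
#   st.session_state.consensus_client = instructor.patch(OpenAI(api_key=openai_key))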
| 839 | 
         
            +
            def create_embedding_plot(rs):
         
     | 
| 840 | 
         
             
                """
         
     | 
| 841 | 
         
            +
                function to create embedding plot
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 842 | 
         
             
                """
         
     | 
| 
         | 
|
| 843 | 
         | 
| 844 | 
         
            +
                pltsource = ColumnDataSource(data=dict(
         
     | 
| 845 | 
         
            +
                    x=st.session_state.umap_x,
         
     | 
| 846 | 
         
            +
                    y=st.session_state.umap_y,
         
     | 
| 847 | 
         
            +
                    title=st.session_state.titles,
         
     | 
| 848 | 
         
            +
                    link=st.session_state.bibcode,
         
     | 
| 849 | 
         
            +
                ))
         
     | 
| 850 | 
         
            +
             
     | 
| 851 | 
         
            +
                rsflag = np.zeros((len(st.session_state.ids),))
         
     | 
| 852 | 
         
            +
                rsflag[np.array([k for k in rs])] = 1
         
     | 
| 853 | 
         | 
| 854 | 
         
            +
                # outflag = np.zeros((len(st.session_state.ids),))
         
     | 
| 855 | 
         
            +
                # outflag[np.array([k for k in find_outliers(rs)])] = 1
         
     | 
| 856 | 
         
            +
                pltsource.data['colors'] = rsflag * 0.8 + 0.1
         
     | 
| 857 | 
         
            +
                # pltsource.data['colors'][outflag] = 0.5
         
     | 
| 858 | 
         
            +
                pltsource.data['sizes'] = (rsflag + 1)**5 / 100
         
     | 
| 859 | 
         
            +
             
     | 
| 860 | 
         
            +
                TOOLTIPS = """
         
     | 
| 861 | 
         
            +
                <div style="width:300px;">
         
     | 
| 862 | 
         
            +
                ID: $index
         
     | 
| 863 | 
         
            +
                ($x, $y)
         
     | 
| 864 | 
         
            +
                @title <br>
         
     | 
| 865 | 
         
            +
                @link <br> <br>
         
     | 
| 866 | 
         
            +
                </div>
         
     | 
| 867 | 
         
             
                """
         
     | 
| 868 | 
         
            +
             
     | 
| 869 | 
         
            +
                mapper = linear_cmap(field_name="colors", palette=Spectral5, low=0., high=1.)
         
     | 
| 870 | 
         
            +
             
     | 
| 871 | 
         
            +
                p = figure(width=700, height=900, tooltips=TOOLTIPS, x_range=(0, 20), y_range=(-4.2,18),
         
     | 
| 872 | 
         
            +
                        title="UMAP projection of embeddings for the astro-ph corpus")
         
     | 
| 873 | 
         
            +
             
     | 
| 874 | 
         
            +
                p.axis.visible=False
         
     | 
| 875 | 
         
            +
                p.grid.visible=False
         
     | 
| 876 | 
         
            +
                p.outline_line_alpha = 0.
         
     | 
| 877 | 
         
            +
             
     | 
| 878 | 
         
            +
                p.circle('x', 'y', radius='sizes', source=pltsource, alpha=0.3, fill_color=mapper, fill_alpha='colors', line_color="lightgrey",line_alpha=0.1)
         
     | 
| 879 | 
         
            +
             
     | 
| 880 | 
         
            +
                return p
         
     | 
| 881 | 
         
            +
             
     | 
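The hunk above uses a compact highlighting trick: a single 0/1 flag array over the corpus drives both the colormap field and the marker radius, so the retrieved papers stand out against the full background of points. A minimal, self-contained sketch of the same pattern outside Streamlit; the point count, coordinates, and retrieved indices below are invented for illustration:

import numpy as np
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral5
from bokeh.plotting import figure
from bokeh.transform import linear_cmap

n = 500
rng = np.random.default_rng(0)
src = ColumnDataSource(data=dict(x=rng.normal(10, 3, n), y=rng.normal(7, 3, n)))

rs = [3, 42, 137]                          # hypothetical retrieved indices
flag = np.zeros(n)
flag[np.array(rs)] = 1

src.data['colors'] = flag * 0.8 + 0.1      # background maps to 0.1, hits to 0.9
src.data['sizes'] = (flag + 1) ** 5 / 100  # radius 0.01 vs 0.32, so hits are ~30x larger

mapper = linear_cmap(field_name='colors', palette=Spectral5, low=0., high=1.)
p = figure(width=700, height=500)
p.circle('x', 'y', radius='sizes', source=src,
         fill_color=mapper, fill_alpha='colors', line_alpha=0.1)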
+
+if submit_button:
+
+    keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
+    toggles = {'Keyword weighting': toggle_a, 'Time weighting': toggle_b, 'Citation weighting': toggle_c}
+
+    if (method == "Semantic search"):
+        with st.spinner('set retrieval method to'+ method):
+            st.session_state.ec = EmbeddingRetrievalSystem()
+    elif (method == "Semantic search + HyDE"):
+        with st.spinner('set retrieval method to'+ method):
+            st.session_state.ec = HydeRetrievalSystem()
+    elif (method == "Semantic search + HyDE + CoHERE"):
+        with st.spinner('set retrieval method to'+ method):
+            st.session_state.ec = HydeCohereRetrievalSystem()
+    st.toast('loaded retrieval system')
+
+    with st.spinner(search_text_list[np.random.choice(len(search_text_list))]):
+
+        st.session_state.ec.query_input_keywords = keywords
+        st.session_state.ec.toggles = toggles
+        st.session_state.ec.question_type = question_type
+        st.session_state.ec.rag_method = method
+        st.session_state.ec.gen_method = method2
+
+        if method2 == "Basic RAG":
+            st.session_state.gen_method = 'rag'
+        elif method2 == "ReAct Agent":
+            st.session_state.gen_method = 'agent'
+
+        if st.session_state.gen_method == 'agent':
+            answer = run_agent_qa(query, top_k)
+            rs = get_topk(query, top_k)
+
+            answer_text = answer['output']
+            st.write(answer_text)
+
+            file_path = "agent_trace.txt"
+            with open(file_path, 'r') as file:
+                intermediate_steps = file.read()
+            st.expander('Intermediate steps', expanded=False).write(intermediate_steps)
+
+        elif st.session_state.gen_method == 'rag':
+            answer, rs = make_rag_qa_answer(query, top_k)
+            answer_text = answer['answer']
+            st.write(answer_text)
+
+        triggered_keywords = st.session_state.ec.query_kws
+
+        with st.spinner('compiling top-k papers'+ method):
+            papers_df = get_paper_df(rs)
+
+            with st.expander("Relevant papers", expanded=True):
+                # st.dataframe(papers_df, hide_index=True)
+                st.data_editor(papers_df, column_config = {'ADS Link':st.column_config.LinkColumn(display_text= 'https://ui.adsabs.harvard.edu/abs/(.*?)/abstract')})
+
+        st.write('**Triggered keywords:** `'+ "`, `".join(triggered_keywords)+'`')
+
+    col1, col2 = st.columns(2)
+
+    with col1:
+        with st.expander("Evaluating question type", expanded=True):
+            st.subheader("Question type suggestion")
+            question_type_gen = guess_question_type(query)
+            if '<categorization>' in question_type_gen:
+                question_type_gen = question_type_gen.split('<categorization>')[1]
+            if '</categorization>' in question_type_gen:
+                question_type_gen = question_type_gen.split('</categorization>')[0]
+            question_type_gen = question_type_gen.replace('\n','  \n')
+            st.markdown(question_type_gen)
+
+    with col2:
+        with st.expander("Evaluating abstract consensus", expanded=True):
+            consensus_answer = evaluate_overall_consensus(query, [st.session_state.abstracts[i] for i in rs])
+            st.subheader("Consensus: "+consensus_answer.consensus)
+            st.markdown(consensus_answer.explanation)
+            st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
+
+    session_vars = {
+        "runtime": "pathfinder_v1_online",
+        "query": query,
+        "question_type": question_type,
+        'Keyword weighting': toggle_a,
+        'Time weighting': toggle_b,
+        'Citation weighting': toggle_c,
+        "rag_method" : method,
+        "gen_method" : method2,
+        "answer" : answer_text,
+        "topk" : ['%.0f' %i for i in rs],
+        "topk_scores" : ['%.6f' %rs[i] for i in rs],
+        "topk_papers": list(papers_df['ADS Link']),
+    }
+
+    @st.fragment()
+    def download_op(data):
+        json_string = json.dumps(data)
+        st.download_button(
+            label='Download output',
+            file_name="pathfinder_data.json",
+            mime="application/json",
+            data=json_string,)
+
+    with st.sidebar:
+        download_op(session_vars)
+
+    embedding_plot = create_embedding_plot(rs)
+    st.bokeh_chart(embedding_plot)
+
+else:
+    st.info("Use the sidebar to tweak the search parameters to get better results.")
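Since the exported session_vars dict is plain JSON, a run saved through the "Download output" button can be reloaded later for offline analysis. A small sketch, assuming a local copy of the downloaded file (the path is hypothetical); note that the dict above serializes top-k indices and scores as strings, so they need converting back:

import json

with open('pathfinder_data.json') as fh:   # hypothetical local copy of the download
    run = json.load(fh)

print(run['query'], '->', run['rag_method'], '+', run['gen_method'])
topk = [int(i) for i in run['topk']]       # indices were written with '%.0f'
scores = [float(s) for s in run['topk_scores']]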
         
local_files/astro_ph_ga_feeds_ada_embedding_27-Jun-2023.pkl → data/data-00000-of-00012.arrow
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b4bbd812e330ce29cf46abbb701ff70b5c25047753922fcc6dd347cd96944ca7
+size 481016544

local_files/astro_ph_ga_embedding_16-Jun-2024.pkl → data/data-00001-of-00012.arrow
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8b95552c0290335c946c519766b99b356425672d692dfb550789cb10e574fb63
+size 475735248

local_files/astro_ph_ga_embedding_27-Jun-2023.pkl → data/data-00002-of-00012.arrow
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ea8ca6e7ac9b5c7e7fc4ffeb7b304d6cf48fbd97ee0e43ffff4291360a96ecdb
+size 477037032

local_files/astro_ph_ga_feeds_ada_embedding_16-Jun-2024.pkl → data/data-00003-of-00012.arrow
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6b38c307368b75a5352aeb678b105d8349ed64c0bf42846d9d8c05b88f3f86ee
+size 480337696
data/data-00004-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77e6a294bc709aaa1fa103903f372516a86e303a9de36471c930af9d3d45ed81
+size 475570280

data/data-00005-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abe6acc1a93e5c445a1b790d5e6f81032d57d28fae840e1672b013b7e00b6ebc
+size 474685320

data/data-00006-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d1186e8f83b60d78efca8ee4aaa526a4a25d1c99108befd904a27927df3721a
+size 452749528

data/data-00007-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:475be1db95279824c32d170c39ae664485c911c0b499f3f287eb76cb8ffa3672
+size 456206336

data/data-00008-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b407b33b433a6eb9d67071b605a637456ebaf0dfbb758eeab746755646448a4
+size 467900584

data/data-00009-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:015f809f2197e9fe98b4bf7d851f52bbbc29bd25f2a6264d7bbac6ca0c4029d3
+size 479707864

data/data-00010-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d41cff04251d57fd742310815c5cbc9a8527e9c57775971dd36f262d20d1d5b
+size 466979224

data/data-00011-of-00012.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d5634d0dc5cb54e6b859176c86d0bae7eec0b4a4a487e9478d2e49b780a6338
+size 486948696
data/dataset_info.json
ADDED
@@ -0,0 +1,188 @@
+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "pathfinder_arxiv_data",
+  "dataset_size": 5770056875,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00000-of-00012.parquet": {
+      "num_bytes": 384481705,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00001-of-00012.parquet": {
+      "num_bytes": 383347319,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00002-of-00012.parquet": {
+      "num_bytes": 383133689,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00003-of-00012.parquet": {
+      "num_bytes": 384399351,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00004-of-00012.parquet": {
+      "num_bytes": 382810245,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00005-of-00012.parquet": {
+      "num_bytes": 382870394,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00006-of-00012.parquet": {
+      "num_bytes": 364849142,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00007-of-00012.parquet": {
+      "num_bytes": 363965178,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00008-of-00012.parquet": {
+      "num_bytes": 376639054,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00009-of-00012.parquet": {
+      "num_bytes": 384035100,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00010-of-00012.parquet": {
+      "num_bytes": 355126903,
+      "checksum": null
+    },
+    "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00011-of-00012.parquet": {
+      "num_bytes": 359912183,
+      "checksum": null
+    }
+  },
+  "download_size": 4505570263,
+  "features": {
+    "ads_id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "arxiv_id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "title": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "abstract": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embed": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "umap_x": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "umap_y": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "date": {
+      "dtype": "date32",
+      "_type": "Value"
+    },
+    "cites": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "bibcode": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "keywords": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "ads_keywords": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "read_count": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "doi": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "authors": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "aff": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "cite_bibcodes": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "ref_bibcodes": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 10275627138,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 5770056875,
+      "num_examples": 499142,
+      "shard_lengths": [
+        42596,
+        43596,
+        43595,
+        42595,
+        43595,
+        43595,
+        46595,
+        44595,
+        43595,
+        43595,
+        43595,
+        17595
+      ],
+      "dataset_name": "pathfinder_arxiv_data"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
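The splits block above is easy to sanity-check after a fresh clone: the per-shard lengths must add up to the declared number of examples. A quick sketch (the relative path assumes you run it from the repo root):

import json

with open('data/dataset_info.json') as fh:
    info = json.load(fh)

train = info['splits']['train']
# 12 shard lengths summing to 499142 papers
assert sum(train['shard_lengths']) == train['num_examples']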
         
data/state.json
ADDED
@@ -0,0 +1,46 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00012.arrow"
+    },
+    {
+      "filename": "data-00001-of-00012.arrow"
+    },
+    {
+      "filename": "data-00002-of-00012.arrow"
+    },
+    {
+      "filename": "data-00003-of-00012.arrow"
+    },
+    {
+      "filename": "data-00004-of-00012.arrow"
+    },
+    {
+      "filename": "data-00005-of-00012.arrow"
+    },
+    {
+      "filename": "data-00006-of-00012.arrow"
+    },
+    {
+      "filename": "data-00007-of-00012.arrow"
+    },
+    {
+      "filename": "data-00008-of-00012.arrow"
+    },
+    {
+      "filename": "data-00009-of-00012.arrow"
+    },
+    {
+      "filename": "data-00010-of-00012.arrow"
+    },
+    {
+      "filename": "data-00011-of-00012.arrow"
+    }
+  ],
+  "_fingerprint": "10a80a75c30e04f8",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
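Together with dataset_info.json and the twelve arrow shards, this state.json completes the on-disk layout that datasets.Dataset.save_to_disk writes, so the whole corpus loads back in one call. A sketch, assuming the repo is checked out with git-lfs so the shards are real arrow files rather than pointer stubs:

import numpy as np
from datasets import load_from_disk

ds = load_from_disk('data')   # reads state.json, then memory-maps the 12 shards
print(ds.num_rows)            # 499142, matching dataset_info.json

# Columns the app builds its plot and retrieval state from:
xy = np.column_stack([ds['umap_x'], ds['umap_y']])  # 2-D UMAP coordinates
embeds = np.asarray(ds['embed'], dtype=np.float32)  # (num_rows, dim); large in RAM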
         
local_files/astro_ph_ga_feeds_upto_16-Jun-2024.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:89114c7ff34595e424f1585d32aec5665a07f26399e75bb8b40b4de7737ac2d0
-size 134799303

local_files/astro_ph_ga_feeds_upto_27-Jun-2023.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:29237e0e973a5fcd4df826c09432a069e6a471d5725fdfc9a0f8c7c62b69e188
-size 89228171
pages/.ipynb_checkpoints/Untitled-checkpoint.ipynb
DELETED
@@ -1,6 +0,0 @@
-{
- "cells": [],
- "metadata": {},
- "nbformat": 4,
- "nbformat_minor": 5
-}
        pages/1_arxiv_embedding_explorer.py
    DELETED
    
    | 
         @@ -1,121 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            import streamlit as st
         
     | 
| 2 | 
         
            -
            import pandas as pd
         
     | 
| 3 | 
         
            -
            import numpy as np
         
     | 
| 4 | 
         
            -
            import matplotlib.pyplot as plt
         
     | 
| 5 | 
         
            -
            import pickle
         
     | 
| 6 | 
         
            -
            from bokeh.palettes import OrRd
         
     | 
| 7 | 
         
            -
            from bokeh.plotting import figure, show
         
     | 
| 8 | 
         
            -
            from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
         
     | 
| 9 | 
         
            -
            import cloudpickle as cp
         
     | 
| 10 | 
         
            -
            import pickle
         
     | 
| 11 | 
         
            -
            from scipy import stats
         
     | 
| 12 | 
         
            -
            from urllib.request import urlopen
         
     | 
| 13 | 
         
            -
             
     | 
| 14 | 
         
            -
            @st.cache_data
         
     | 
| 15 | 
         
            -
            def get_feeds_data(url):
         
     | 
| 16 | 
         
            -
                # data = cp.load(urlopen(url))
         
     | 
| 17 | 
         
            -
                with open(url, "rb") as fp:
         
     | 
| 18 | 
         
            -
                    data = pickle.load(fp)
         
     | 
| 19 | 
         
            -
                st.sidebar.success("Fetched data from API!")
         
     | 
| 20 | 
         
            -
                return data
         
     | 
| 21 | 
         
            -
             
     | 
| 22 | 
         
            -
            # embeddings = OpenAIEmbeddings()
         
     | 
| 23 | 
         
            -
             
     | 
| 24 | 
         
            -
            dateval = "27-Jun-2023"
         
     | 
| 25 | 
         
            -
            feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
         
     | 
| 26 | 
         
            -
            embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
         
     | 
| 27 | 
         
            -
            gal_feeds = get_feeds_data(feeds_link)
         
     | 
| 28 | 
         
            -
            arxiv_ada_embeddings = get_feeds_data(embed_link)
         
     | 
| 29 | 
         
            -
             
     | 
| 30 | 
         
            -
            @st.cache_data
         
     | 
| 31 | 
         
            -
            def get_embedding_data(url):
         
     | 
| 32 | 
         
            -
                # data = cp.load(urlopen(url))
         
     | 
| 33 | 
         
            -
                with open(url, "rb") as fp:
         
     | 
| 34 | 
         
            -
                    data = pickle.load(fp)
         
     | 
| 35 | 
         
            -
                st.sidebar.success("Fetched data from API!")
         
     | 
| 36 | 
         
            -
                return data
         
     | 
| 37 | 
         
            -
             
     | 
| 38 | 
         
            -
            url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
         
     | 
| 39 | 
         
            -
            # e2d, _, _, _, _ = get_embedding_data(url)
         
     | 
| 40 | 
         
            -
            embedding = get_embedding_data(url)
         
     | 
| 41 | 
         
            -
             
     | 
| 42 | 
         
            -
            st.title("ArXiv+GPT3 embedding explorer")
         
     | 
| 43 | 
         
            -
            st.markdown('[Includes papers up to: `'+dateval+'`]')
         
     | 
| 44 | 
         
            -
            st.markdown("This is an explorer for astro-ph.GA papers on the arXiv (up to Apt 18th, 2023). The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/) after which the collected abstracts are run through `text-embedding-ada-002` with [langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) to generate a unique vector correpsonding to each paper. These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). Upgrade to chaotic neural suggested by Jo Ciucă, thank you! More to come (hopefully) with GPT-4 and its applications!")
         
     | 
| 45 | 
         
            -
            st.markdown("Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space that tries to conserve as much similarity information as possible. Nearby points in UMAP space are similar, and grow dissimiliar as you move farther away. The axes do not have any physical meaning.")
         
     | 
| 46 | 
         
            -
             
     | 
| 47 | 
         
            -
            from tqdm import tqdm
         
     | 
| 48 | 
         
            -
            ctr = -1
         
     | 
| 49 | 
         
            -
            num_chunks = len(gal_feeds)
         
     | 
| 50 | 
         
            -
            all_text = []
         
     | 
| 51 | 
         
            -
            all_titles = []
         
     | 
| 52 | 
         
            -
            all_arxivid = []
         
     | 
| 53 | 
         
            -
            all_links = []
         
     | 
| 54 | 
         
            -
             
     | 
| 55 | 
         
            -
            for nc in tqdm(range(num_chunks)):
         
     | 
| 56 | 
         
            -
                for i in range(len(gal_feeds[nc].entries)):
         
     | 
| 57 | 
         
            -
                    text = gal_feeds[nc].entries[i].summary
         
     | 
| 58 | 
         
            -
                    text = text.replace('\n', ' ')
         
     | 
| 59 | 
         
            -
                    text = text.replace('\\', '')
         
     | 
| 60 | 
         
            -
                    all_text.append(text)
         
     | 
| 61 | 
         
            -
                    all_titles.append(gal_feeds[nc].entries[i].title)
         
     | 
| 62 | 
         
            -
                    all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
         
     | 
| 63 | 
         
            -
                    all_links.append(gal_feeds[nc].entries[i].links[1].href)
         
     | 
| 64 | 
         
            -
             
     | 
| 65 | 
         
            -
             
     | 
| 66 | 
         
            -
            def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15):
         
     | 
| 67 | 
         
            -
                X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
         
     | 
| 68 | 
         
            -
                positions = np.vstack([X.ravel(), Y.ravel()])
         
     | 
| 69 | 
         
            -
                values = np.vstack([m1, m2])
         
     | 
| 70 | 
         
            -
                kernel = stats.gaussian_kde(values)
         
     | 
| 71 | 
         
            -
                Z = np.reshape(kernel(positions).T, X.shape)
         
     | 
| 72 | 
         
            -
                return X, Y, Z
         
     | 
| 73 | 
         
            -
             
     | 
| 74 | 
         
            -
            st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
         
     | 
| 75 | 
         
            -
            st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')
         
     | 
| 76 | 
         
            -
             
     | 
| 77 | 
         
            -
            st.sidebar.text_input("Search query", key="phrase", value="Quenching")
         
     | 
| 78 | 
         
            -
            alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.81)
         
     | 
| 79 | 
         
            -
            size_value = st.sidebar.slider("Pick the hexbin gridsize",10,50,20)
         
     | 
| 80 | 
         
            -
             
     | 
| 81 | 
         
            -
            phrase=st.session_state.phrase
         
     | 
| 82 | 
         
            -
             
     | 
| 83 | 
         
            -
            phrase_flags = np.zeros((len(all_text),))
         
     | 
| 84 | 
         
            -
            for i in range(len(all_text)):
         
     | 
| 85 | 
         
            -
                if phrase.lower() in all_text[i].lower():
         
     | 
| 86 | 
         
            -
                    phrase_flags[i] = 1
         
     | 
| 87 | 
         
            -
             
     | 
| 88 | 
         
            -
             
     | 
source = ColumnDataSource(data=dict(
    x=embedding[0:, 0],
    y=embedding[0:, 1],
    title=all_titles,
    link=all_links,
))

TOOLTIPS = """
<div style="width:300px;">
ID: $index
($x, $y)
@title <br>
@link <br> <br>
</div>
"""

p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5, 15),
           title="UMAP projection of embeddings for the astro-ph.GA corpus: "+phrase)

# p.hexbin(embedding[phrase_flags==1,0], embedding[phrase_flags==1,1], size=size_value,
#          palette=np.flip(OrRd[8]), alpha=alpha_value)
p.circle('x', 'y', size=3, source=source, alpha=0.3)
st.bokeh_chart(p)
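For readers unfamiliar with Bokeh's tooltip templating: $-prefixed fields are built-in hover variables, while @-prefixed fields are looked up as columns of the ColumnDataSource. A tiny sketch with toy data (not part of the app):

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

src = ColumnDataSource(data=dict(x=[1, 2], y=[3, 4], title=["paper A", "paper B"]))
fig = figure(tooltips="ID: $index ($x, $y) @title")  # $index/$x/$y: hover built-ins; @title: CDS column
fig.circle('x', 'y', size=5, source=src)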
fig = plt.figure(figsize=(10.5, 9*0.8328))
plt.scatter(embedding[0:, 0], embedding[0:, 1], s=2, alpha=0.1)
plt.hexbin(embedding[phrase_flags == 1, 0], embedding[phrase_flags == 1, 1],
           gridsize=size_value, cmap='viridis', alpha=alpha_value, extent=(-1, 16, 1.5, 16), mincnt=10)
plt.title("UMAP localization of heatmap keyword: "+phrase)
plt.axis([0, 15, 2.5, 15])
clbr = plt.colorbar(); clbr.set_label('# papers')
plt.axis('off')
st.pyplot(fig)
pages/2_paper_search.py
DELETED
@@ -1,201 +0,0 @@
import datetime, os
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
import openai
import faiss
import streamlit as st
import feedparser
import urllib
import cloudpickle as cp
import pickle
from urllib.request import urlopen
from summa import summarizer
import numpy as np

# openai.organization = st.secrets.openai.org
# openai.api_key = st.secrets.openai.api_key
openai.organization = st.secrets["org"]
openai.api_key = st.secrets["api_key"]
os.environ["OPENAI_API_KEY"] = openai.api_key

@st.cache_data
def get_feeds_data(url):
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded data!")
    # data = cp.load(urlopen(url))
    # st.sidebar.success("Fetched data from API!")
    return data

embeddings = OpenAIEmbeddings()

# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"

dateval = "27-Jun-2023"
feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)

ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors = [], [], [], [], []

for nc in range(num_chunks):

    for i in range(len(gal_feeds[nc].entries)):
        text = gal_feeds[nc].entries[i].summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(gal_feeds[nc].entries[i].title)
        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
        all_links.append(gal_feeds[nc].entries[i].links[1].href)
        all_authors.append(gal_feeds[nc].entries[i].authors)

d = arxiv_ada_embeddings.shape[1]   # dimension
nb = arxiv_ada_embeddings.shape[0]  # database size
xb = arxiv_ada_embeddings.astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)

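For orientation, IndexFlatL2 is an exact (brute-force) L2 index, so index.search returns the true nearest neighbours. A toy round trip with synthetic vectors (the names here are illustrative only):

import faiss
import numpy as np

rng = np.random.default_rng(0)
toy = rng.random((100, 1536), dtype=np.float32)   # 1536 = ada-002 embedding size
toy_index = faiss.IndexFlatL2(1536)
toy_index.add(toy)
D, I = toy_index.search(toy[:1], 5)               # distances and indices, both shape (1, 5)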
def run_simple_query(search_query='all:sed+fitting', max_results=10, start=0, sort_by='lastUpdatedDate', sort_order='descending'):
    """
    Query arXiv to return search results for a particular query

    Parameters
    ----------
    search_query: str
        query term. use prefixes ti, au, abs, co, jr, cat, m, id, all as applicable.
    max_results: int, default = 10
        number of results to return. numbers > 1000 generally lead to timeouts
    start: int, default = 0
        start index for results reported. use this if you're interested in running chunks.

    Returns
    -------
    feed: dict
        object containing requested results parsed with feedparser

    Notes
    -----
    add functionality for chunk parsing, as well as storage and retrieval
    """

    # Base api query url
    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i&max_results=%i&sortBy=%s&sortOrder=%s' % (search_query,
                                                                                start,
                                                                                max_results, sort_by, sort_order)

    response = urllib.request.urlopen(base_url+query).read()
    feed = feedparser.parse(response)
    return feed

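A usage sketch for the function above (the 'abs:' prefix follows the arXiv API syntax listed in the docstring; the query string itself is illustrative):

feed = run_simple_query(search_query='abs:quenching', max_results=5)
for entry in feed.entries:
    print(entry.title)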
def find_papers_by_author(auth_name):

    doc_ids = []
    for doc_id in range(len(all_authors)):
        for auth_id in range(len(all_authors[doc_id])):
            if auth_name.lower() in all_authors[doc_id][auth_id]['name'].lower():
                print('Doc ID: ', doc_id, ' | arXiv: ', all_arxivid[doc_id], '| ', all_titles[doc_id], ' | Author entry: ', all_authors[doc_id][auth_id]['name'])
                doc_ids.append(doc_id)

    return doc_ids

def faiss_based_indices(input_vector, nindex=10):
    xq = input_vector.reshape(-1, 1).T.astype('float32')
    D, I = index.search(xq, nindex)
    return I[0], D[0]


def list_similar_papers_v2(model_data,
                           doc_id=[], input_type='doc_id',
                           show_authors=False, show_summary=False,
                           return_n=10):

    arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data

    if input_type == 'doc_id':
        print('Doc ID: ', doc_id, ', title: ', all_titles[doc_id])
#         inferred_vector = model.infer_vector(train_corpus[doc_id].words)
        inferred_vector = arxiv_ada_embeddings[doc_id, 0:]
        start_range = 1  # the query paper is itself in the index, so skip its self-match
    elif input_type == 'arxiv_id':
        print('ArXiv id: ', doc_id)
        arxiv_query_feed = run_simple_query(search_query='id:'+str(doc_id))
        if len(arxiv_query_feed.entries) == 0:
            print('error: arxiv id not found.')
            return
        else:
            print('Title: '+arxiv_query_feed.entries[0].title)
            inferred_vector = np.array(embeddings.embed_query(arxiv_query_feed.entries[0].summary))
#         arxiv_query_tokens = gensim.utils.simple_preprocess(arxiv_query_feed.entries[0].summary)
#         inferred_vector = model.infer_vector(arxiv_query_tokens)

        start_range = 0
    elif input_type == 'keywords':
#         print('Keyword(s): ',[doc_id[i] for i in range(len(doc_id))])
#         word_vector = model.wv[doc_id[0]]
#         if len(doc_id) > 1:
#            print('multi-keyword')
#            for i in range(1,len(doc_id)):
#                word_vector = word_vector + model.wv[doc_id[i]]
# #         word_vector = model.infer_vector(doc_id)
#         inferred_vector = word_vector
        inferred_vector = np.array(embeddings.embed_query(doc_id))
        start_range = 0
    else:
        print('unrecognized input type.')
        return

#     sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    sims, dists = faiss_based_indices(inferred_vector, return_n+2)
    textstr = ''

    textstr = textstr + '-----------------------------\n'
    textstr = textstr + 'Most similar/relevant papers: \n'
    textstr = textstr + '-----------------------------\n\n'
    for i in range(start_range, start_range+return_n):

        # print(i, all_titles[sims[i]], ' (Distance: %.2f' %dists[i], ')')
        textstr = textstr + str(i+1)+'. **' + all_titles[sims[i]] + '** (Distance: %.2f' % dists[i]+')   \n'
        textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
        if show_authors == True:
            textstr = textstr + '**Authors:**  '
            temp = all_authors[sims[i]]
            for ak in range(len(temp)):
                if ak < len(temp)-1:
                    textstr = textstr + temp[ak].name + ', '
                else:
                    textstr = textstr + temp[ak].name + '   \n'
        if show_summary == True:
            textstr = textstr + '**Summary:**  '
            text = all_text[sims[i]]
            text = text.replace('\n', ' ')
            textstr = textstr + summarizer.summarize(text) + '  \n'
        if show_authors == True or show_summary == True:
            textstr = textstr + ' '
        textstr = textstr + '  \n'
    return textstr


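One detail worth noting above: for a doc_id query the paper is already stored in the index, so its nearest neighbour is itself at distance zero, which is why start_range is set to 1 in that branch. A toy demonstration (synthetic data, hypothetical names):

import faiss
import numpy as np

vecs = np.random.rand(50, 8).astype('float32')
ix = faiss.IndexFlatL2(8)
ix.add(vecs)
D, I = ix.search(vecs[7:8], 4)
print(I[0][0], D[0][0])   # 7, 0.0 -- a stored vector is its own nearest neighbour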
model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]

st.title('ArXiv similarity search:')
st.markdown('Search for similar papers by arxiv id or phrase:')
st.markdown('[Includes papers up to: `'+dateval+'`]')

search_type = st.radio(
    "What are you searching by?",
    ('arxiv id', 'text query'), index=1)

query = st.text_input('Search query or arxivid', value="what causes galaxy quenching?")
show_authors = st.checkbox('Show author information', value=True)
show_summary = st.checkbox('Show paper summary', value=True)
return_n = st.slider('How many papers should I show?', 1, 30, 10)

if search_type == 'arxiv id':
    sims = list_similar_papers_v2(model_data, doc_id=query, input_type='arxiv_id', show_authors=show_authors, show_summary=show_summary, return_n=return_n)
else:
    sims = list_similar_papers_v2(model_data, doc_id=query, input_type='keywords', show_authors=show_authors, show_summary=show_summary, return_n=return_n)

st.markdown(sims)
pages/3_answering_questions.py
DELETED
@@ -1,352 +0,0 @@
import os
import datetime
import faiss
import streamlit as st
import feedparser
import urllib
import cloudpickle as cp
import pickle
from urllib.request import urlopen
from summa import summarizer
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_openai import AzureOpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_openai import AzureChatOpenAI
from langchain import hub
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

embeddings = AzureOpenAIEmbeddings(
    deployment="embedding",
    model="text-embedding-ada-002",
    azure_endpoint=st.secrets["endpoint1"],
)

llm = AzureChatOpenAI(
    deployment_name="gpt4_small",
    openai_api_version="2023-12-01-preview",
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    temperature=0.
)

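For a sense of the shapes involved: embed_query on the embeddings object defined above returns a plain Python list of floats, 1536-dimensional for text-embedding-ada-002. A usage sketch:

vec = embeddings.embed_query("what causes galaxy quenching?")
print(len(vec))   # 1536 for text-embedding-ada-002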
@st.cache_data
def get_feeds_data(url):
    # data = cp.load(urlopen(url))
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded data")
    return data

# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
dateval = "27-Jun-2023"
feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)

@st.cache_data
def get_embedding_data(url):
    # data = cp.load(urlopen(url))
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Fetched data from API!")
    return data

# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
e2d = get_embedding_data(url)
# e2d, _, _, _, _ = get_embedding_data(url)

ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors = [], [], [], [], []

for nc in range(num_chunks):

    for i in range(len(gal_feeds[nc].entries)):
        text = gal_feeds[nc].entries[i].summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(gal_feeds[nc].entries[i].title)
        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
        all_links.append(gal_feeds[nc].entries[i].links[1].href)
        all_authors.append(gal_feeds[nc].entries[i].authors)

d = arxiv_ada_embeddings.shape[1]   # dimension
nb = arxiv_ada_embeddings.shape[0]  # database size
xb = arxiv_ada_embeddings.astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)

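For readers unfamiliar with feedparser entries, the fields consumed by the flattening loop above map onto the arXiv Atom feed roughly as follows (values illustrative):

# entry.title         -> paper title
# entry.summary       -> abstract text (with embedded newlines)
# entry.id            -> e.g. 'http://arxiv.org/abs/2306.01234v1'; the slicing above strips the version
# entry.links[1].href -> URL of the paper's abstract page
# entry.authors       -> list of author entries, each carrying a 'name'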
def run_simple_query(search_query='all:sed+fitting', max_results=10, start=0, sort_by='lastUpdatedDate', sort_order='descending'):
    """
    Query arXiv to return search results for a particular query

    Parameters
    ----------
    search_query: str
        query term. use prefixes ti, au, abs, co, jr, cat, m, id, all as applicable.
    max_results: int, default = 10
        number of results to return. numbers > 1000 generally lead to timeouts
    start: int, default = 0
        start index for results reported. use this if you're interested in running chunks.

    Returns
    -------
    feed: dict
        object containing requested results parsed with feedparser

    Notes
    -----
    add functionality for chunk parsing, as well as storage and retrieval
    """

    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i&max_results=%i&sortBy=%s&sortOrder=%s' % (search_query,
                                                                                start,
                                                                                max_results, sort_by, sort_order)

    response = urllib.request.urlopen(base_url+query).read()
    feed = feedparser.parse(response)
    return feed

def find_papers_by_author(auth_name):

    doc_ids = []
    for doc_id in range(len(all_authors)):
        for auth_id in range(len(all_authors[doc_id])):
            if auth_name.lower() in all_authors[doc_id][auth_id]['name'].lower():
                print('Doc ID: ', doc_id, ' | arXiv: ', all_arxivid[doc_id], '| ', all_titles[doc_id], ' | Author entry: ', all_authors[doc_id][auth_id]['name'])
                doc_ids.append(doc_id)

    return doc_ids

def faiss_based_indices(input_vector, nindex=10):
    xq = input_vector.reshape(-1, 1).T.astype('float32')
    D, I = index.search(xq, nindex)
    return I[0], D[0]

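A brief shape walk-through for faiss_based_indices above (illustrative):

# input_vector (d,)  --reshape(-1,1).T-->  xq (1, d) float32 batch
# index.search(xq, nindex)  -->  D (1, nindex) distances, I (1, nindex) indices
# returning I[0], D[0] unwraps the singleton batch dimension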
def list_similar_papers_v2(model_data,
                           doc_id=[], input_type='doc_id',
                           show_authors=False, show_summary=False,
                           return_n=10):

    arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data

    if input_type == 'doc_id':
        print('Doc ID: ', doc_id, ', title: ', all_titles[doc_id])
#         inferred_vector = model.infer_vector(train_corpus[doc_id].words)
        inferred_vector = arxiv_ada_embeddings[doc_id, 0:]
        start_range = 1  # the query paper is itself in the index, so skip its self-match
    elif input_type == 'arxiv_id':
        print('ArXiv id: ', doc_id)
        arxiv_query_feed = run_simple_query(search_query='id:'+str(doc_id))
        if len(arxiv_query_feed.entries) == 0:
            print('error: arxiv id not found.')
            return
        else:
            print('Title: '+arxiv_query_feed.entries[0].title)
            inferred_vector = np.array(embeddings.embed_query(arxiv_query_feed.entries[0].summary))
        start_range = 0
    elif input_type == 'keywords':
        inferred_vector = np.array(embeddings.embed_query(doc_id))
        start_range = 0
    else:
        print('unrecognized input type.')
        return

    sims, dists = faiss_based_indices(inferred_vector, return_n+2)
    textstr = ''
    abstracts_relevant = []
    fhdrs = []

    for i in range(start_range, start_range+return_n):

        abstracts_relevant.append(all_text[sims[i]])
        fhdr = str(sims[i])+'_'+all_authors[sims[i]][0]['name'].split()[-1] + all_arxivid[sims[i]][0:2] + '_' + all_arxivid[sims[i]]
        fhdrs.append(fhdr)
        textstr = textstr + str(i+1)+'. **' + all_titles[sims[i]] + '** (Distance: %.2f' % dists[i]+')   \n'
        textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
        if show_authors == True:
            textstr = textstr + '**Authors:**  '
            temp = all_authors[sims[i]]
            for ak in range(len(temp)):
                if ak < len(temp)-1:
                    textstr = textstr + temp[ak].name + ', '
                else:
                    textstr = textstr + temp[ak].name + '   \n'
        if show_summary == True:
            textstr = textstr + '**Summary:**  '
            text = all_text[sims[i]]
            text = text.replace('\n', ' ')
            textstr = textstr + summarizer.summarize(text) + '  \n'
        if show_authors == True or show_summary == True:
            textstr = textstr + ' '
        textstr = textstr + '  \n'
    return textstr, abstracts_relevant, fhdrs, sims


def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
    # note: neither `openai` nor `API_ENDPOINT` is defined in this file as shown,
    # so this helper would raise a NameError if called
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}",
    }

    data = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }

    if max_tokens is not None:
        data["max_tokens"] = max_tokens
    response = requests.post(API_ENDPOINT, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        raise Exception(f"Error {response.status_code}: {response.text}")

model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def get_textstr(i, show_authors=False, show_summary=False):
    textstr = ''
    textstr = '**' + all_titles[i] + '**   \n'
    textstr = textstr + '**ArXiv:** ['+all_arxivid[i]+'](https://arxiv.org/abs/'+all_arxivid[i]+')  \n'
    if show_authors == True:
        textstr = textstr + '**Authors:**  '
        temp = all_authors[i]
        for ak in range(len(temp)):
            if ak < len(temp)-1:
                textstr = textstr + temp[ak].name + ', '
            else:
                textstr = textstr + temp[ak].name + '   \n'
    if show_summary == True:
        textstr = textstr + '**Summary:**  '
        text = all_text[i]
        text = text.replace('\n', ' ')
        textstr = textstr + summarizer.summarize(text) + '  \n'
    if show_authors == True or show_summary == True:
        textstr = textstr + ' '
    textstr = textstr + '  \n'

    return textstr


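format_docs above is the glue between the retriever and the prompt template used further down: it flattens a list of LangChain Document objects into a single context string. A minimal illustration:

from langchain_core.documents import Document

docs = [Document(page_content="abstract one"), Document(page_content="abstract two")]
print(format_docs(docs))   # -> "abstract one\n\nabstract two"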
def run_rag(query, return_n=10, show_authors=True, show_summary=True):

    sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
                                                        doc_id=query,
                                                        input_type='keywords',
                                                        show_authors=show_authors, show_summary=show_summary,
                                                        return_n=return_n)

    # write each retrieved abstract to its own text file (the filename encodes
    # index, first-author surname, and arxiv id) so it can be re-loaded below
    temp_abst = ''
    loaders = []
    for i in range(len(absts)):
        temp_abst = absts[i]

        try:
            text_file = open("absts/"+fhdrs[i]+".txt", "w")
        except FileNotFoundError:
            os.mkdir('absts')
            text_file = open("absts/"+fhdrs[i]+".txt", "w")
        n = text_file.write(temp_abst)
        text_file.close()
        loader = TextLoader("absts/"+fhdrs[i]+".txt")
        loaders.append(loader)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    splits = text_splitter.split_documents([loader.load()[0] for loader in loaders])
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    template = """You are an assistant with expertise in astrophysics for question-answering tasks.
    Use the following pieces of retrieved context from the literature to answer the question.
    If you don't know the answer, just say that you don't know.
    Use six sentences maximum and keep the answer concise.

    {context}

    Question: {question}

    Answer:"""
    custom_rag_prompt = PromptTemplate.from_template(template)

    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

| 306 | 
         
            -
                rag_answer = rag_chain_with_source.invoke(query)
         
     | 
| 307 | 
         
            -
             
     | 
| 308 | 
         
            -
                st.markdown('### User query: '+query)
         
     | 
| 309 | 
         
            -
             
     | 
| 310 | 
         
            -
                st.markdown(rag_answer['answer'])
         
     | 
| 311 | 
         
            -
                opstr = '#### Primary sources: \n'
         
     | 
| 312 | 
         
            -
                srcnames = []
         
     | 
| 313 | 
         
            -
                for i in range(len(rag_answer['context'])):
         
     | 
| 314 | 
         
            -
                    srcnames.append(rag_answer['context'][0].metadata['source'])
         
     | 
| 315 | 
         
            -
             
     | 
| 316 | 
         
            -
                srcnames = np.unique(srcnames)
         
     | 
| 317 | 
         
            -
                srcindices = []
         
     | 
| 318 | 
         
            -
                for i in range(len(srcnames)):
         
     | 
| 319 | 
         
            -
                    temp = srcnames[i].split('_')[1]
         
     | 
| 320 | 
         
            -
                    srcindices.append(int(srcnames[i].split('_')[0].split('/')[1]))
         
     | 
| 321 | 
         
            -
                    if int(temp[-2:]) < 40:
         
     | 
| 322 | 
         
            -
                        temp = temp[0:-2] + ' et al. 20' + temp[-2:]
         
     | 
| 323 | 
         
            -
                    else:
         
     | 
| 324 | 
         
            -
                        temp = temp[0:-2] + ' et al. 19' + temp[-2:]
         
     | 
| 325 | 
         
            -
                    temp = '['+temp+']('+all_links[int(srcnames[i].split('_')[0].split('/')[1])]+')'
         
     | 
| 326 | 
         
            -
                    st.markdown(temp)
         
     | 
| 327 | 
         
            -
                abs_indices = np.array(srcindices)
         
     | 
| 328 | 
         
            -
             
     | 
| 329 | 
         
            -
                fig = plt.figure(figsize=(9,9))
         
     | 
| 330 | 
         
            -
                plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
         
     | 
| 331 | 
         
            -
                plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
         
     | 
| 332 | 
         
            -
                plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
         
     | 
| 333 | 
         
            -
                plt.title('localization for question: '+query)
         
     | 
| 334 | 
         
            -
                st.pyplot(fig)
         
     | 
| 335 | 
         
            -
             
     | 
| 336 | 
         
            -
                st.markdown('\n #### List of relevant papers:')
         
     | 
| 337 | 
         
            -
                st.markdown(sims)
         
     | 
| 338 | 
         
            -
             
     | 
| 339 | 
         
            -
                return rag_answer
         
     | 
| 340 | 
         
            -
             
     | 
| 341 | 
         
            -
             
     | 
| 342 | 
         
            -
            st.title('ArXiv-based question answering')
         
     | 
| 343 | 
         
            -
            st.markdown('[Includes papers up to: `'+dateval+'`]')
         
     | 
| 344 | 
         
            -
            st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
         
     | 
| 345 | 
         
            -
            st.markdown('The answers are followed by relevant source(s) used in the answer, a graph showing which part of the astro-ph.GA manifold it drew the answer from (tightly clustered points generally indicate high quality/consensus answers) followed by a bunch of relevant papers used by the RAG to compose the answer.')
         
     | 
| 346 | 
         
            -
            st.markdown('If this does not satisfactorily answer your question or rambles too much, you can also try the older `qa_sources_v1` page.')
         
     | 
| 347 | 
         
            -
             
     | 
| 348 | 
         
            -
            query = st.text_input('Your question here:',
         
     | 
| 349 | 
         
            -
            value="What causes galaxy quenching at high redshifts?")
         
     | 
| 350 | 
         
            -
            return_n = st.slider('How many papers should I show?', 1, 30, 10)
         
     | 
| 351 | 
         
            -
             
     | 
| 352 | 
         
            -
            sims = run_rag(query, return_n = return_n)
         
     | 
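The chain above funnels the retrieved documents through a format_docs helper before they reach the prompt. That helper is defined outside this excerpt; a minimal sketch of the usual LangChain pattern it presumably follows, assuming the retriever returns Document objects with a page_content attribute:

    # Hypothetical sketch of format_docs; the actual definition in the app may differ.
    def format_docs(docs):
        # join the retrieved document bodies into one context string for the prompt
        return "\n\n".join(doc.page_content for doc in docs)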

pages/4_author_search.py
DELETED
@@ -1,138 +0,0 @@
-import os
-import datetime
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-import requests
-import json
-
-from langchain_openai import AzureOpenAIEmbeddings
-from langchain.llms import OpenAI
-from langchain_openai import AzureChatOpenAI
-
-os.environ["OPENAI_API_TYPE"] = "azure"
-os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
-os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
-os.environ["OPENAI_API_VERSION"] = "2023-05-15"
-
-embeddings = AzureOpenAIEmbeddings(
-    deployment="embedding",
-    model="text-embedding-ada-002",
-    azure_endpoint=st.secrets["endpoint1"],
-)
-
-llm = AzureChatOpenAI(
-    deployment_name="gpt4_small",
-    openai_api_version="2023-12-01-preview",
-    azure_endpoint=st.secrets["endpoint2"],
-    openai_api_key=st.secrets["key2"],
-    openai_api_type="azure",
-    temperature=0.
-)
-
-
-@st.cache_data
-def get_feeds_data(url):
-    # load the pickled arxiv feeds from a local file
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Loaded data")
-    return data
-
-dateval = "27-Jun-2023"
-feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
-embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
-gal_feeds = get_feeds_data(feeds_link)
-arxiv_ada_embeddings = get_feeds_data(embed_link)
-
-@st.cache_data
-def get_embedding_data(url):
-    # load the precomputed 2D embedding of the abstracts
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
-e2d = get_embedding_data(url)
-
-ctr = -1
-num_chunks = len(gal_feeds)
-all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []
-
-for nc in range(num_chunks):
-
-    for i in range(len(gal_feeds[nc].entries)):
-        text = gal_feeds[nc].entries[i].summary
-        text = text.replace('\n', ' ')
-        text = text.replace('\\', '')
-        all_text.append(text)
-        all_titles.append(gal_feeds[nc].entries[i].title)
-        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
-        all_links.append(gal_feeds[nc].entries[i].links[1].href)
-        all_authors.append(gal_feeds[nc].entries[i].authors)
-        # arXiv feeds publish ISO-8601 timestamps like '2023-06-27T17:59:59Z';
-        # the slicing rewrites that to '2023-06-27 17:59:59' before parsing
-        temp = gal_feeds[nc].entries[i].published
-        datetime_object = datetime.datetime.strptime(temp[0:10]+' '+temp[11:-1], '%Y-%m-%d %H:%M:%S')
-        all_pubdates.append(datetime_object)
-        all_old.append((datetime.datetime.now() - datetime_object).days)
-
-def make_author_plot(inputstr, print_summary = False):
-
-    # flag every paper whose author list matches any comma-separated alias
-    # (case-insensitive substring containment)
-    authr_list = inputstr.split(', ')
-    author_flag = np.zeros((len(all_authors),))
-    ctr = 0
-    pts = []
-    for i in range(len(all_authors)):
-        for j in range(len(all_authors[i])):
-            for k in range(len(authr_list)):
-                authr = authr_list[k]
-                if authr.lower() in all_authors[i][j]['name'].lower():
-                    author_flag[i] = 1
-                    ctr = ctr+1
-                    printstr = str(ctr)+'. [age= %.1f yr, x: %.1f, y: %.1f]' %(all_old[i]/365, e2d[i,0], e2d[i,1])+' name: '+all_authors[i][j]['name']
-                    pts.append(printstr)
-                    pts.append('Paper title: ' + all_titles[i])
-    print(np.sum(author_flag))
-    author_flag = author_flag.astype(bool)
-
-    fig = plt.figure(figsize=(10.8,9.))
-    plt.scatter(e2d[0:,0], e2d[0:,1], s=1, color='k', alpha=0.3)
-    plt.scatter(e2d[0:,0][author_flag], e2d[0:,1][author_flag],
-                s=100, c=np.array(all_old)[author_flag]/365, alpha=1.0, cmap='coolwarm')
-    clbr = plt.colorbar(); clbr.set_label('lookback time [years]', fontsize=18)
-    tempx = plt.xlim(); tempy = plt.ylim()
-    plt.title('Author: '+inputstr, fontsize=18, fontweight='bold')
-    st.pyplot(fig)
-
-    if print_summary:
-        st.markdown('---')
-        for i in range(len(pts)):
-            st.markdown(pts[i])
-
-    return
-
-
-st.title('Author search')
-st.markdown('[Includes papers up to: `'+dateval+'`]')
-st.markdown('Trace the location and trajectory of a researcher in the astro-ph.GA manifold.')
-st.markdown('Matching is by case-insensitive substring (no fuzzy matching), so check the printed summaries below to refine your input string. If you publish under multiple aliases, separate them with a comma followed by a space, as in the example below.')
-
-query = st.text_input('Author name:',
-                      value="Kartheik Iyer, Kartheik G. Iyer, K. G. Iyer")
-
-make_author_plot(query, print_summary=True)
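As a quick illustration of the matching rule in make_author_plot (any comma-separated alias matching as a case-insensitive substring of an author name), a minimal standalone sketch with illustrative names:

    # Demo of the alias-matching rule above (hypothetical names, not from the dataset).
    aliases = "Kartheik Iyer, Kartheik G. Iyer, K. G. Iyer".split(', ')

    def matches(name, aliases):
        return any(a.lower() in name.lower() for a in aliases)

    print(matches("Kartheik G. Iyer", aliases))  # True
    print(matches("KARTHEIK IYER", aliases))     # True  (case-insensitive)
    print(matches("K. Iyer", aliases))           # False (no alias is a substring)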
         

pages/5_research_hotspots.py
DELETED
@@ -1,130 +0,0 @@
-import os
-import datetime
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-import requests
-import json
-from scipy import ndimage
-
-from langchain_openai import AzureOpenAIEmbeddings
-from langchain.llms import OpenAI
-from langchain_openai import AzureChatOpenAI
-
-os.environ["OPENAI_API_TYPE"] = "azure"
-os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
-os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
-os.environ["OPENAI_API_VERSION"] = "2023-05-15"
-
-embeddings = AzureOpenAIEmbeddings(
-    deployment="embedding",
-    model="text-embedding-ada-002",
-    azure_endpoint=st.secrets["endpoint1"],
-)
-
-llm = AzureChatOpenAI(
-    deployment_name="gpt4_small",
-    openai_api_version="2023-12-01-preview",
-    azure_endpoint=st.secrets["endpoint2"],
-    openai_api_key=st.secrets["key2"],
-    openai_api_type="azure",
-    temperature=0.
-)
-
-
-@st.cache_data
-def get_feeds_data(url):
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Loaded data")
-    return data
-
-dateval = "27-Jun-2023"
-feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
-embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
-gal_feeds = get_feeds_data(feeds_link)
-arxiv_ada_embeddings = get_feeds_data(embed_link)
-
-@st.cache_data
-def get_embedding_data(url):
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
-e2d = get_embedding_data(url)
-
-ctr = -1
-num_chunks = len(gal_feeds)
-all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []
-
-for nc in range(num_chunks):
-
-    for i in range(len(gal_feeds[nc].entries)):
-        text = gal_feeds[nc].entries[i].summary
-        text = text.replace('\n', ' ')
-        text = text.replace('\\', '')
-        all_text.append(text)
-        all_titles.append(gal_feeds[nc].entries[i].title)
-        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
-        all_links.append(gal_feeds[nc].entries[i].links[1].href)
-        all_authors.append(gal_feeds[nc].entries[i].authors)
-        temp = gal_feeds[nc].entries[i].published
-        datetime_object = datetime.datetime.strptime(temp[0:10]+' '+temp[11:-1], '%Y-%m-%d %H:%M:%S')
-        all_pubdates.append(datetime_object)
-        all_old.append((datetime.datetime.now() - datetime_object).days)
-
-def make_time_excess_plot(midage = 0, tolage = 1, onlyolder = False):
-
-    # histogram bin width and Gaussian smoothing scale, in embedding-plane units
-    bw = 0.05
-    sigma = 4.0
-    # papers within +/- tolage years of the chosen central lookback time
-    mask = (np.abs(np.array(all_old) - midage*365) < tolage*365)
-
-    if onlyolder:
-        # reference sample: only papers older than the selected window
-        mask2 = (np.array(all_old) > midage*365 + tolage*365/2)
-        a = np.histogram2d(e2d[0:,0][mask2], e2d[0:,1][mask2], bins=(np.arange(0,17,bw)), density=True)
-    else:
-        # reference sample: the full manifold
-        a = np.histogram2d(e2d[0:,0], e2d[0:,1], bins=(np.arange(0,17,bw)), density=True)
-    b = np.histogram2d(e2d[0:,0][mask], e2d[0:,1][mask], bins=(np.arange(0,17,bw)), density=True)
-    # smoothed excess density of the selected window relative to the reference
-    temp = b[0].T - a[0].T
-    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')
-    vscale = (np.nanpercentile(temp,99.5) - np.nanpercentile(temp,0.5))/2
-
-    fig, ax = plt.subplots(1,1,figsize=(11,9))
-    plt.pcolor(a[1][0:-1] + (a[1][1]-a[1][0])/2, a[2][0:-1] + (a[2][1]-a[2][0])/2,
-               temp, cmap='bwr',
-               vmin=-vscale, vmax=vscale); plt.colorbar()
-    # plt.scatter(e2d[0:,0], e2d[0:,1],s=2,color='k',alpha=0.1)
-    plt.title('excess research in a +/- %.1f yr window centered %.1f yrs ago' %(tolage, midage))
-    plt.axis([0,14,1,15])
-    plt.axis('off')
-    st.pyplot(fig)
-    return
-
-st.title('Research hotspots')
-st.markdown('[Includes papers up to: `'+dateval+'`]')
-
-midage = st.slider('Age', 0., 10., 0.)
-tolage = st.slider('Period width', 0., 10., 1.)
-
-st.markdown('Compare the research in a given time period to the full manifold.')
-make_time_excess_plot(midage, tolage, onlyolder = False)
-
-st.markdown('Compare the research in a given time period to research older than that.')
-make_time_excess_plot(midage, tolage, onlyolder = True)
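The hotspot map above is a smoothed difference of two normalized 2D histograms: the selected time window versus a reference sample. A self-contained sketch of the same computation on synthetic points (all names and data here are illustrative):

    import numpy as np
    from scipy import ndimage

    rng = np.random.default_rng(0)
    ref = rng.normal(8, 3, size=(5000, 2))           # reference sample over the full plane
    recent = rng.normal([6, 9], 1.5, size=(500, 2))  # clustered 'recent' sample

    bins = np.arange(0, 17, 0.05)
    h_ref, _, _ = np.histogram2d(ref[:, 0], ref[:, 1], bins=(bins, bins), density=True)
    h_rec, _, _ = np.histogram2d(recent[:, 0], recent[:, 1], bins=(bins, bins), density=True)

    # positive values mark bins with an excess of recent activity
    excess = ndimage.gaussian_filter(h_rec.T - h_ref.T, sigma=4.0, mode='nearest')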
         

pages/6_qa_sources_v1.py
DELETED
@@ -1,286 +0,0 @@
-import datetime, os
-from langchain.llms import OpenAI
-from langchain.embeddings import OpenAIEmbeddings
-import openai
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-
-import requests
-import json
-from langchain.document_loaders import TextLoader
-from langchain.indexes import VectorstoreIndexCreator
-API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
-
-openai.organization = st.secrets["org"]
-openai.api_key = st.secrets["api_key"]
-os.environ["OPENAI_API_KEY"] = openai.api_key
-
-@st.cache_data
-def get_feeds_data(url):
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Loaded data")
-    return data
-
-embeddings = OpenAIEmbeddings()
-
-dateval = "27-Jun-2023"
-feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
-embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
-gal_feeds = get_feeds_data(feeds_link)
-arxiv_ada_embeddings = get_feeds_data(embed_link)
-
-@st.cache_data
-def get_embedding_data(url):
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
-e2d = get_embedding_data(url)
-
-ctr = -1
-num_chunks = len(gal_feeds)
-all_text, all_titles, all_arxivid, all_links, all_authors = [], [], [], [], []
-
-for nc in range(num_chunks):
-
-    for i in range(len(gal_feeds[nc].entries)):
-        text = gal_feeds[nc].entries[i].summary
-        text = text.replace('\n', ' ')
-        text = text.replace('\\', '')
-        all_text.append(text)
-        all_titles.append(gal_feeds[nc].entries[i].title)
-        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
-        all_links.append(gal_feeds[nc].entries[i].links[1].href)
-        all_authors.append(gal_feeds[nc].entries[i].authors)
-
-# build a flat L2 faiss index over the ada abstract embeddings
-d = arxiv_ada_embeddings.shape[1]                       # dimension
-nb = arxiv_ada_embeddings.shape[0]                      # database size
-xb = arxiv_ada_embeddings.astype('float32')
-index = faiss.IndexFlatL2(d)
-index.add(xb)
-
-def run_simple_query(search_query = 'all:sed+fitting', max_results = 10, start = 0, sort_by = 'lastUpdatedDate', sort_order = 'descending'):
-    """
-    Query arXiv to return search results for a particular query
-
-    Parameters
-    ----------
-    search_query: str
-        query term. use prefixes ti, au, abs, co, jr, cat, m, id, all as applicable.
-    max_results: int, default = 10
-        number of results to return. numbers > 1000 generally lead to timeouts
-    start: int, default = 0
-        start index for results reported. use this if you're interested in running chunks.
-
-    Returns
-    -------
-    feed: dict
-        object containing requested results parsed with feedparser
-
-    Notes
-    -----
-    add functionality for chunk parsing, as well as storage and retrieval
-    """
-
-    base_url = 'http://export.arxiv.org/api/query?'
-    query = 'search_query=%s&start=%i&max_results=%i&sortBy=%s&sortOrder=%s' % (search_query,
-                                                                                start,
-                                                                                max_results, sort_by, sort_order)
-
-    response = urllib.request.urlopen(base_url+query).read()
-    feed = feedparser.parse(response)
-    return feed
     | 
-def find_papers_by_author(auth_name):
-
-    # case-insensitive substring match against every author entry
-    doc_ids = []
-    for doc_id in range(len(all_authors)):
-        for auth_id in range(len(all_authors[doc_id])):
-            if auth_name.lower() in all_authors[doc_id][auth_id]['name'].lower():
-                print('Doc ID: ',doc_id, ' | arXiv: ', all_arxivid[doc_id], '| ', all_titles[doc_id],' | Author entry: ', all_authors[doc_id][auth_id]['name'])
-                doc_ids.append(doc_id)
-
-    return doc_ids
-
-def faiss_based_indices(input_vector, nindex=10, yrmin = 1990, yrmax = 2024):
-    # nearest neighbours by L2 distance; yrmin/yrmax are accepted but not used here
-    xq = input_vector.reshape(-1,1).T.astype('float32')
-    D, I = index.search(xq, nindex)
-    return I[0], D[0]
-
-def list_similar_papers_v2(model_data,
-                           doc_id = [], input_type = 'doc_id',
-                           show_authors = False, show_summary = False,
-                           return_n = 10, yrmin = 1990, yrmax = 2024):
-
-    arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data
-
-    if input_type == 'doc_id':
-        print('Doc ID: ',doc_id,', title: ',all_titles[doc_id])
-        inferred_vector = arxiv_ada_embeddings[doc_id,0:]
-        start_range = 1
-    elif input_type == 'arxiv_id':
-        print('ArXiv id: ',doc_id)
-        arxiv_query_feed = run_simple_query(search_query='id:'+str(doc_id))
-        if len(arxiv_query_feed.entries) == 0:
-            print('error: arxiv id not found.')
-            return
-        else:
-            print('Title: '+arxiv_query_feed.entries[0].title)
-            inferred_vector = np.array(embeddings.embed_query(arxiv_query_feed.entries[0].summary))
-        start_range = 0
-    elif input_type == 'keywords':
-        inferred_vector = np.array(embeddings.embed_query(doc_id))
-        start_range = 0
-    else:
-        print('unrecognized input type.')
-        return
-
-    sims, dists = faiss_based_indices(inferred_vector, return_n+2, yrmin = 1990, yrmax = 2024)
-    textstr = ''
-    abstracts_relevant = []
-    fhdrs = []
-
-    for i in range(start_range, start_range+return_n):
-
-        abstracts_relevant.append(all_text[sims[i]])
-        fhdr = all_authors[sims[i]][0]['name'].split()[-1] + all_arxivid[sims[i]][0:2] +'_'+ all_arxivid[sims[i]]
-        fhdrs.append(fhdr)
-        textstr = textstr + str(i+1)+'. **'+ all_titles[sims[i]] +'** (Distance: %.2f' %dists[i]+')   \n'
-        textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
-        if show_authors:
-            textstr = textstr + '**Authors:**  '
-            temp = all_authors[sims[i]]
-            for ak in range(len(temp)):
-                if ak < len(temp)-1:
         
     | 
| 172 | 
         
            -
                                textstr = textstr + temp[ak].name + ', '
         
     | 
| 173 | 
         
            -
                            else:
         
     | 
| 174 | 
         
            -
                                textstr = textstr + temp[ak].name + '   \n'
         
     | 
| 175 | 
         
            -
                    if show_summary == True:
         
     | 
| 176 | 
         
            -
                        textstr = textstr + '**Summary:**  '
         
     | 
| 177 | 
         
            -
                        text = all_text[sims[i]]
         
     | 
| 178 | 
         
            -
                        text = text.replace('\n', ' ')
         
     | 
| 179 | 
         
            -
                        textstr = textstr + summarizer.summarize(text) + '  \n'
         
     | 
| 180 | 
         
            -
                    if show_authors == True or show_summary == True:
         
     | 
| 181 | 
         
            -
                        textstr = textstr + ' '
         
     | 
| 182 | 
         
            -
                    textstr = textstr + '  \n'
         
     | 
| 183 | 
         
            -
                return textstr, abstracts_relevant, fhdrs, sims
         
     | 
| 184 | 
         
            -
             
     | 
| 185 | 
         
            -
            model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]
         
     | 
| 186 | 
         
            -
             
     | 
| 187 | 
         
            -
            def run_query(query, return_n = 3, yrmin = 1990, yrmax = 2024, show_pure_answer = False, show_all_sources = True):
         
     | 
| 188 | 
         
            -
             
     | 
| 189 | 
         
            -
                show_authors = True
         
     | 
| 190 | 
         
            -
                show_summary = True
         
     | 
| 191 | 
         
            -
                sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
         
     | 
| 192 | 
         
            -
                                              doc_id = query,
         
     | 
| 193 | 
         
            -
                                              input_type='keywords',
         
     | 
| 194 | 
         
            -
                                              show_authors = show_authors, show_summary = show_summary,
         
     | 
| 195 | 
         
            -
                                              return_n = return_n, yrmin = 1990, yrmax = 2024)
         
     | 
| 196 | 
         
            -
             
     | 
| 197 | 
         
            -
                temp_abst = ''
         
     | 
| 198 | 
         
            -
                loaders = []
         
     | 
| 199 | 
         
            -
                for i in range(len(absts)):
         
     | 
| 200 | 
         
            -
                    temp_abst = absts[i]
         
     | 
| 201 | 
         
            -
             
     | 
| 202 | 
         
            -
                    try:
         
     | 
| 203 | 
         
            -
                        text_file = open("absts/"+fhdrs[i]+".txt", "w")
         
     | 
| 204 | 
         
            -
                    except:
         
     | 
| 205 | 
         
            -
                        os.mkdir('absts')
         
     | 
| 206 | 
         
            -
                        text_file = open("absts/"+fhdrs[i]+".txt", "w")
         
     | 
| 207 | 
         
            -
                    n = text_file.write(temp_abst)
         
     | 
| 208 | 
         
            -
                    text_file.close()
         
     | 
| 209 | 
         
            -
                    loader = TextLoader("absts/"+fhdrs[i]+".txt")
         
     | 
| 210 | 
         
            -
                    loaders.append(loader)
         
     | 
| 211 | 
         
            -
             
     | 
| 212 | 
         
            -
                lc_index = VectorstoreIndexCreator().from_loaders(loaders)
         
     | 
| 213 | 
         
            -
             
     | 
| 214 | 
         
            -
                st.markdown('### User query: '+query)
         
     | 
| 215 | 
         
            -
                if show_pure_answer == True:
         
     | 
| 216 | 
         
            -
                    st.markdown('pure answer:')
         
     | 
| 217 | 
         
            -
                    st.markdown(lc_index.query(query))
         
     | 
| 218 | 
         
            -
                    st.markdown(' ')
         
     | 
| 219 | 
         
            -
                st.markdown('#### context-based answer from sources:')
         
     | 
| 220 | 
         
            -
                output = lc_index.query_with_sources(query + ' Let\'s work this out in a step by step way to be sure we have the right answer.' ) #zero-shot in-context prompting from Zhou+22, Kojima+22
         
     | 
| 221 | 
         
            -
                st.markdown(output['answer'])
         
     | 
| 222 | 
         
            -
                opstr = '#### Primary sources: \n'
         
     | 
| 223 | 
         
            -
                st.markdown(opstr)
         
     | 
| 224 | 
         
            -
             
     | 
| 225 | 
         
            -
            #     opstr = ''
         
     | 
| 226 | 
         
            -
            #     for i in range(len(output['sources'])):
         
     | 
| 227 | 
         
            -
            #         opstr = opstr +'\n'+ output['sources'][i]
         
     | 
| 228 | 
         
            -
             
     | 
| 229 | 
         
            -
                textstr = ''
         
     | 
| 230 | 
         
            -
                ng = len(output['sources'].split())
         
     | 
| 231 | 
         
            -
                abs_indices = []
         
     | 
| 232 | 
         
            -
             
     | 
| 233 | 
         
            -
                for i in range(ng):
         
     | 
| 234 | 
         
            -
                    if i == (ng-1):
         
     | 
| 235 | 
         
            -
                        tempid = output['sources'].split()[i].split('_')[1][0:-4]
         
     | 
| 236 | 
         
            -
                    else:
         
     | 
| 237 | 
         
            -
                        tempid = output['sources'].split()[i].split('_')[1][0:-5]
         
     | 
| 238 | 
         
            -
                    try:
         
     | 
| 239 | 
         
            -
                        abs_index = all_arxivid.index(tempid)
         
     | 
| 240 | 
         
            -
                        abs_indices.append(abs_index)
         
     | 
| 241 | 
         
            -
                        textstr = textstr + str(i+1)+'. **'+ all_titles[abs_index] +'   \n'
         
     | 
| 242 | 
         
            -
                        textstr = textstr + '**ArXiv:** ['+all_arxivid[abs_index]+'](https://arxiv.org/abs/'+all_arxivid[abs_index]+')  \n'
         
     | 
| 243 | 
         
            -
                        textstr = textstr + '**Authors:**  '
         
     | 
| 244 | 
         
            -
                        temp = all_authors[abs_index]
         
     | 
| 245 | 
         
            -
                        for ak in range(4):
         
     | 
| 246 | 
         
            -
                            if ak < len(temp)-1:
         
     | 
| 247 | 
         
            -
                                textstr = textstr + temp[ak].name + ', '
         
     | 
| 248 | 
         
            -
                            else:
         
     | 
| 249 | 
         
            -
                                textstr = textstr + temp[ak].name + '   \n'
         
     | 
| 250 | 
         
            -
                        if len(temp) > 3:
         
     | 
| 251 | 
         
            -
                            textstr = textstr + ' et al.    \n'
         
     | 
| 252 | 
         
            -
                        textstr = textstr + '**Summary:**  '
         
     | 
| 253 | 
         
            -
                        text = all_text[abs_index]
         
     | 
| 254 | 
         
            -
                        text = text.replace('\n', ' ')
         
     | 
| 255 | 
         
            -
                        textstr = textstr + summarizer.summarize(text) + '  \n'
         
     | 
| 256 | 
         
            -
                    except:
         
     | 
| 257 | 
         
            -
                        textstr = textstr + output['sources'].split()[i]
         
     | 
| 258 | 
         
            -
                    #         opstr = opstr + '  \n ' + output['sources'].split()[i][6:-5].split('_')[0]
         
     | 
| 259 | 
         
            -
                    #     opstr = opstr + '  \n Arxiv id: ' + output['sources'].split()[i][6:-5].split('_')[1]
         
     | 
| 260 | 
         
            -
             
     | 
| 261 | 
         
            -
                    textstr = textstr + ' '
         
     | 
| 262 | 
         
            -
                    textstr = textstr + '  \n'
         
     | 
| 263 | 
         
            -
                st.markdown(textstr)
         
     | 
| 264 | 
         
            -
             
     | 
| 265 | 
         
            -
                fig = plt.figure(figsize=(9,9))
         
     | 
| 266 | 
         
            -
                plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
         
     | 
| 267 | 
         
            -
                plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
         
     | 
| 268 | 
         
            -
                plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
         
     | 
| 269 | 
         
            -
                st.pyplot(fig)
         
     | 
| 270 | 
         
            -
             
     | 
| 271 | 
         
            -
                if show_all_sources == True:
         
     | 
| 272 | 
         
            -
                    st.markdown('\n #### Other interesting papers:')
         
     | 
| 273 | 
         
            -
                    st.markdown(sims)
         
     | 
| 274 | 
         
            -
                return output
         
     | 
| 275 | 
         
            -
             
     | 
| 276 | 
         
            -
            st.title('ArXiv-based question answering')
         
     | 
| 277 | 
         
            -
            st.markdown('[Includes papers up to: `'+dateval+'`]')
         
     | 
| 278 | 
         
            -
            st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
         
     | 
| 279 | 
         
            -
             
     | 
| 280 | 
         
            -
            query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
         
     | 
| 281 | 
         
            -
            return_n = st.slider('How many papers should I show?', 1, 20, 10)
         
     | 
| 282 | 
         
            -
            yrmin = st.slider('Min year', 1990,2023, 1990)
         
     | 
| 283 | 
         
            -
            yrmax = st.slider('Max year', 1990, 2024, 2024)
         
     | 
| 284 | 
         
            -
             
     | 
| 285 | 
         
            -
             
     | 
| 286 | 
         
            -
            sims = run_query(query, return_n = return_n, yrmin = yrmin, yrmax = yrmax)
         
     | 
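The retrieval core of the page removed above is faiss_based_indices: a single ada embedding is reshaped into a (1, d) float32 row and searched against a flat L2 FAISS index over all abstract embeddings (note the yrmin/yrmax arguments are accepted but never applied to the search). Below is a minimal self-contained sketch of that lookup pattern; the dimensions and random vectors are stand-ins for the real arxiv_ada_embeddings, not part of the app.

import faiss
import numpy as np

d, nb = 1536, 1000                             # ada-002 embedding dimension; toy corpus size (assumption)
xb = np.random.rand(nb, d).astype('float32')   # stand-in for arxiv_ada_embeddings
index = faiss.IndexFlatL2(d)                   # exact (brute-force) L2 index, no training step needed
index.add(xb)

def faiss_based_indices(input_vector, nindex=10):
    # reshape a 1-D embedding into a (1, d) float32 query row, as in the page above
    xq = input_vector.reshape(-1, 1).T.astype('float32')
    D, I = index.search(xq, nindex)            # distances and row indices of nearest neighbours
    return I[0], D[0]

ids, dists = faiss_based_indices(xb[0], nindex=5)
# ids[0] == 0 and dists[0] ~ 0: the query vector's own row comes back first

IndexFlatL2 does exact search, which is reasonable at the tens-of-thousands-of-abstracts scale used here; approximate indexes (e.g. IVF variants) only pay off at much larger corpus sizes.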
pages/7_answering_questions_2024.py DELETED
@@ -1,352 +0,0 @@
-import os
-import datetime
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-import requests
-import json
-
-from langchain.document_loaders import TextLoader
-from langchain.indexes import VectorstoreIndexCreator
-from langchain_openai import AzureOpenAIEmbeddings
-from langchain.llms import OpenAI
-from langchain_openai import AzureChatOpenAI
-from langchain import hub
-from langchain_core.prompts import PromptTemplate
-from langchain_core.runnables import RunnablePassthrough
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnableParallel
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Chroma
-
-os.environ["OPENAI_API_TYPE"] = "azure"
-os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
-os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
-os.environ["OPENAI_API_VERSION"] = "2023-05-15"
-
-embeddings = AzureOpenAIEmbeddings(
-    deployment="embedding",
-    model="text-embedding-ada-002",
-    azure_endpoint=st.secrets["endpoint1"],
-)
-
-llm = AzureChatOpenAI(
-        deployment_name="gpt4_small",
-        openai_api_version="2023-12-01-preview",
-        azure_endpoint=st.secrets["endpoint2"],
-        openai_api_key=st.secrets["key2"],
-        openai_api_type="azure",
-        temperature=0.
-    )
-
-
-@st.cache_data
-def get_feeds_data(url):
-    # data = cp.load(urlopen(url))
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Loaded data")
-    return data
-
-# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
-# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
-dateval = "16-Jun-2024"
-feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
-embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
-gal_feeds = get_feeds_data(feeds_link)
-arxiv_ada_embeddings = get_feeds_data(embed_link)
-
-@st.cache_data
-def get_embedding_data(url):
-    # data = cp.load(urlopen(url))
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
-url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
-e2d = get_embedding_data(url)
-# e2d, _, _, _, _ = get_embedding_data(url)
-
-ctr = -1
-num_chunks = len(gal_feeds)
-all_text, all_titles, all_arxivid, all_links, all_authors = [], [], [], [], []
-
-for nc in range(num_chunks):
-
-    for i in range(len(gal_feeds[nc].entries)):
-        text = gal_feeds[nc].entries[i].summary
-        text = text.replace('\n', ' ')
-        text = text.replace('\\', '')
-        all_text.append(text)
-        all_titles.append(gal_feeds[nc].entries[i].title)
-        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
-        all_links.append(gal_feeds[nc].entries[i].links[1].href)
-        all_authors.append(gal_feeds[nc].entries[i].authors)
-
-d = arxiv_ada_embeddings.shape[1]                           # dimension
-nb = arxiv_ada_embeddings.shape[0]                      # database size
-xb = arxiv_ada_embeddings.astype('float32')
-index = faiss.IndexFlatL2(d)
-index.add(xb)
-
-def run_simple_query(search_query = 'all:sed+fitting', max_results = 10, start = 0, sort_by = 'lastUpdatedDate', sort_order = 'descending'):
-    """
-        Query ArXiv to return search results for a particular query
-        Parameters
-        ----------
-        query: str
-            query term. use prefixes ti, au, abs, co, jr, cat, m, id, all as applicable.
-        max_results: int, default = 10
-            number of results to return. numbers > 1000 generally lead to timeouts
-        start: int, default = 0
-            start index for results reported. use this if you're interested in running chunks.
-        Returns
-        -------
-        feed: dict
-            object containing requested results parsed with feedparser
-        Notes
-        -----
-            add functionality for chunk parsing, as well as storage and retreival
-        """
-
-    base_url = 'http://export.arxiv.org/api/query?';
-    query = 'search_query=%s&start=%i&max_results=%i&sortBy=%s&sortOrder=%s' % (search_query,
-                                                     start,
-                                                     max_results,sort_by,sort_order)
-
-    response = urllib.request.urlopen(base_url+query).read()
-    feed = feedparser.parse(response)
-    return feed
-
-def find_papers_by_author(auth_name):
-
-    doc_ids = []
-    for doc_id in range(len(all_authors)):
-        for auth_id in range(len(all_authors[doc_id])):
-            if auth_name.lower() in all_authors[doc_id][auth_id]['name'].lower():
-                print('Doc ID: ',doc_id, ' | arXiv: ', all_arxivid[doc_id], '| ', all_titles[doc_id],' | Author entry: ', all_authors[doc_id][auth_id]['name'])
-                doc_ids.append(doc_id)
-
-    return doc_ids
-
-def faiss_based_indices(input_vector, nindex=10):
-    xq = input_vector.reshape(-1,1).T.astype('float32')
-    D, I = index.search(xq, nindex)
-    return I[0], D[0]
-
-def list_similar_papers_v2(model_data,
-                        doc_id = [], input_type = 'doc_id',
-                        show_authors = False, show_summary = False,
-                        return_n = 10):
-
-    arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data
-
-    if input_type == 'doc_id':
-        print('Doc ID: ',doc_id,', title: ',all_titles[doc_id])
-#         inferred_vector = model.infer_vector(train_corpus[doc_id].words)
-        inferred_vector = arxiv_ada_embeddings[doc_id,0:]
-        start_range = 1
-    elif input_type == 'arxiv_id':
-        print('ArXiv id: ',doc_id)
-        arxiv_query_feed = run_simple_query(search_query='id:'+str(doc_id))
-        if len(arxiv_query_feed.entries) == 0:
-            print('error: arxiv id not found.')
-            return
-        else:
-            print('Title: '+arxiv_query_feed.entries[0].title)
-            inferred_vector = np.array(embeddings.embed_query(arxiv_query_feed.entries[0].summary))
-        start_range = 0
-    elif input_type == 'keywords':
-        inferred_vector = np.array(embeddings.embed_query(doc_id))
-        start_range = 0
-    else:
-        print('unrecognized input type.')
-        return
-
-    sims, dists = faiss_based_indices(inferred_vector, return_n+2)
-    textstr = ''
-    abstracts_relevant = []
-    fhdrs = []
-
-    for i in range(start_range,start_range+return_n):
-
-        abstracts_relevant.append(all_text[sims[i]])
-        fhdr = str(sims[i])+'_'+all_authors[sims[i]][0]['name'].split()[-1] + all_arxivid[sims[i]][0:2] +'_'+ all_arxivid[sims[i]]
-        fhdrs.append(fhdr)
-        textstr = textstr + str(i+1)+'. **'+ all_titles[sims[i]] +'** (Distance: %.2f' %dists[i]+')   \n'
-        textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
-        if show_authors == True:
-            textstr = textstr + '**Authors:**  '
-            temp = all_authors[sims[i]]
-            for ak in range(len(temp)):
-                if ak < len(temp)-1:
-                    textstr = textstr + temp[ak].name + ', '
-                else:
-                    textstr = textstr + temp[ak].name + '   \n'
-        if show_summary == True:
-            textstr = textstr + '**Summary:**  '
-            text = all_text[sims[i]]
-            text = text.replace('\n', ' ')
-            textstr = textstr + summarizer.summarize(text) + '  \n'
-        if show_authors == True or show_summary == True:
-            textstr = textstr + ' '
-        textstr = textstr + '  \n'
-    return textstr, abstracts_relevant, fhdrs, sims
-
-
-def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {openai.api_key}",
-    }
-
-    data = {
-        "model": model,
-        "messages": messages,
-        "temperature": temperature,
-    }
-
-    if max_tokens is not None:
-        data["max_tokens"] = max_tokens
-    response = requests.post(API_ENDPOINT, headers=headers, data=json.dumps(data))
-    if response.status_code == 200:
-        return response.json()["choices"][0]["message"]["content"]
-    else:
-        raise Exception(f"Error {response.status_code}: {response.text}")
-
-model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]
-
-def format_docs(docs):
-    return "\n\n".join(doc.page_content for doc in docs)
-
-def get_textstr(i, show_authors=False, show_summary=False):
-    textstr = ''
-    textstr = '**'+ all_titles[i] +'**   \n'
-    textstr = textstr + '**ArXiv:** ['+all_arxivid[i]+'](https://arxiv.org/abs/'+all_arxivid[i]+')  \n'
-    if show_authors == True:
-        textstr = textstr + '**Authors:**  '
-        temp = all_authors[i]
-        for ak in range(len(temp)):
-            if ak < len(temp)-1:
-                textstr = textstr + temp[ak].name + ', '
-            else:
-                textstr = textstr + temp[ak].name + '   \n'
-    if show_summary == True:
-        textstr = textstr + '**Summary:**  '
-        text = all_text[i]
-        text = text.replace('\n', ' ')
-        textstr = textstr + summarizer.summarize(text) + '  \n'
-    if show_authors == True or show_summary == True:
-        textstr = textstr + ' '
-    textstr = textstr + '  \n'
-
-    return textstr
-
-
-def run_rag(query, return_n = 10, show_authors = True, show_summary = True):
-
-    sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
-                                  doc_id = query,
-                                  input_type='keywords',
-                                  show_authors = show_authors, show_summary = show_summary,
-                                  return_n = return_n)
-
-    temp_abst = ''
-    loaders = []
-    for i in range(len(absts)):
-        temp_abst = absts[i]
-
-        try:
-            text_file = open("absts/"+fhdrs[i]+".txt", "w")
-        except:
-            os.mkdir('absts')
-            text_file = open("absts/"+fhdrs[i]+".txt", "w")
-        n = text_file.write(temp_abst)
-        text_file.close()
-        loader = TextLoader("absts/"+fhdrs[i]+".txt")
-        loaders.append(loader)
-
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
-    splits = text_splitter.split_documents([loader.load()[0] for loader in loaders])
-    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
-    retriever = vectorstore.as_retriever()
-
-    template = """You are an assistant with expertise in astrophysics for question-answering tasks.
-    Use the following pieces of retrieved context from the literature to answer the question.
-    If you don't know the answer, just say that you don't know.
-    Use six sentences maximum and keep the answer concise.
-
-    {context}
-
-    Question: {question}
-
-    Answer:"""
-    custom_rag_prompt = PromptTemplate.from_template(template)
-
-    rag_chain_from_docs = (
-        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
-        | custom_rag_prompt
-        | llm
-        | StrOutputParser()
-    )
-
-    rag_chain_with_source = RunnableParallel(
-        {"context": retriever, "question": RunnablePassthrough()}
-    ).assign(answer=rag_chain_from_docs)
-
-    rag_answer = rag_chain_with_source.invoke(query)
-
-    st.markdown('### User query: '+query)
-
-    st.markdown(rag_answer['answer'])
-    opstr = '#### Primary sources: \n'
-    srcnames = []
-    for i in range(len(rag_answer['context'])):
-        srcnames.append(rag_answer['context'][0].metadata['source'])
-
-    srcnames = np.unique(srcnames)
-    srcindices = []
-    for i in range(len(srcnames)):
-        temp = srcnames[i].split('_')[1]
-        srcindices.append(int(srcnames[i].split('_')[0].split('/')[1]))
-        if int(temp[-2:]) < 40:
-            temp = temp[0:-2] + ' et al. 20' + temp[-2:]
-        else:
-            temp = temp[0:-2] + ' et al. 19' + temp[-2:]
-        temp = '['+temp+']('+all_links[int(srcnames[i].split('_')[0].split('/')[1])]+')'
-        st.markdown(temp)
-    abs_indices = np.array(srcindices)
-
-    fig = plt.figure(figsize=(9,9))
-    plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
-    plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
-    plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
-    plt.title('localization for question: '+query)
-    st.pyplot(fig)
-
-    st.markdown('\n #### List of relevant papers:')
-    st.markdown(sims)
-
-    return rag_answer
-
-
-st.title('ArXiv-based question answering')
-st.markdown('[Includes papers up to: `'+dateval+'`]')
-st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
-st.markdown('The answers are followed by relevant source(s) used in the answer, a graph showing which part of the astro-ph.GA manifold it drew the answer from (tightly clustered points generally indicate high quality/consensus answers) followed by a bunch of relevant papers used by the RAG to compose the answer.')
-st.markdown('If this does not satisfactorily answer your question or rambles too much, you can also try the older `qa_sources_v1` page.')
-
-query = st.text_input('Your question here:',
-value="What causes galaxy quenching at high redshifts?")
-return_n = st.slider('How many papers should I show?', 1, 30, 10)
-
-sims = run_rag(query, return_n = return_n)
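The answering logic in the page removed above is the LCEL chain in run_rag: a RunnableParallel fans the query out to the Chroma retriever and a RunnablePassthrough, and .assign(answer=...) pipes the gathered context and question through the prompt and LLM, so one invoke returns the answer together with the source documents that grounded it. Below is a stripped-down sketch of the same wiring; the in-memory documents, RunnableLambda retriever, and FakeListLLM are placeholders for the app's Chroma store and Azure GPT-4 deployment, not part of the app itself.

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain_community.llms.fake import FakeListLLM

# two stand-in "abstracts"; the real app loads retrieved arxiv abstracts into Chroma
docs = [Document(page_content="Abstract one."), Document(page_content="Abstract two.")]
retriever = RunnableLambda(lambda query: docs)       # placeholder for vectorstore.as_retriever()
llm = FakeListLLM(responses=["A concise answer."])   # placeholder for AzureChatOpenAI

prompt = PromptTemplate.from_template(
    "Context: {context}\n\nQuestion: {question}\n\nAnswer:"
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# fills {context} with the formatted documents, then runs prompt -> llm -> string
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
    | prompt
    | llm
    | StrOutputParser()
)

# runs retriever and passthrough in parallel, then attaches the generated answer,
# so the output dict carries both the sources ('context') and the 'answer'
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

out = rag_chain_with_source.invoke("What causes galaxy quenching at high redshifts?")
print(out["answer"])          # the LLM reply
print(len(out["context"]))    # the Document objects that grounded it

Keeping the retrieved Documents in the chain output is what lets the page render clickable primary sources and plot where the answer sits in the 2-D embedding of the abstracts.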
pages/8_arxiv_embedding_explorer_2024.py DELETED
@@ -1,121 +0,0 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import pickle
-from bokeh.palettes import OrRd
-from bokeh.plotting import figure, show
-from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
-import cloudpickle as cp
-import pickle
-from scipy import stats
-from urllib.request import urlopen
-
-@st.cache_data
-def get_feeds_data(url):
-    # data = cp.load(urlopen(url))
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-# embeddings = OpenAIEmbeddings()
-
-dateval = "16-Jun-2024"
-feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
-embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
-gal_feeds = get_feeds_data(feeds_link)
-arxiv_ada_embeddings = get_feeds_data(embed_link)
-
-@st.cache_data
-def get_embedding_data(url):
-    # data = cp.load(urlopen(url))
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
-# e2d, _, _, _, _ = get_embedding_data(url)
-embedding = get_embedding_data(url)
-
-st.title("ArXiv+GPT3 embedding explorer")
-st.markdown('[Includes papers up to: `'+dateval+'`]')
-st.markdown("This is an explorer for astro-ph.GA papers on the arXiv. The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/), after which the collected abstracts are run through `text-embedding-ada-002` with [langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) to generate a unique vector corresponding to each paper. These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). Upgrade to chaotic neural suggested by Jo Ciucă, thank you! More to come (hopefully) with GPT-4 and its applications!")
-st.markdown("Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space that tries to conserve as much similarity information as possible. Nearby points in UMAP space are similar, and grow dissimilar as you move farther away. The axes do not have any physical meaning.")
-
-from tqdm import tqdm
-ctr = -1
-num_chunks = len(gal_feeds)
-all_text = []
-all_titles = []
-all_arxivid = []
-all_links = []
-
-for nc in tqdm(range(num_chunks)):
-    for i in range(len(gal_feeds[nc].entries)):
-        text = gal_feeds[nc].entries[i].summary
-        text = text.replace('\n', ' ')
-        text = text.replace('\\', '')
-        all_text.append(text)
-        all_titles.append(gal_feeds[nc].entries[i].title)
-        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
-        all_links.append(gal_feeds[nc].entries[i].links[1].href)
-
-
-def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15):
-    X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
-    positions = np.vstack([X.ravel(), Y.ravel()])
-    values = np.vstack([m1, m2])
-    kernel = stats.gaussian_kde(values)
-    Z = np.reshape(kernel(positions).T, X.shape)
-    return X, Y, Z
-
-st.sidebar.markdown('This widget lets you look for papers containing specific phrases in the dataset and shows them as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
-st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error; just enter a different query in that case. (ii) there are some empty tooltips when you hover; these correspond to the underlying hexbins and can be ignored.')
-
-st.sidebar.text_input("Search query", key="phrase", value="Quenching")
-alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.81)
-size_value = st.sidebar.slider("Pick the hexbin gridsize",10,50,20)
-
-phrase = st.session_state.phrase
-
-phrase_flags = np.zeros((len(all_text),))
-for i in range(len(all_text)):
-    if phrase.lower() in all_text[i].lower():
-        phrase_flags[i] = 1
-
-
-source = ColumnDataSource(data=dict(
-    x=embedding[0:,0],
-    y=embedding[0:,1],
-    title=all_titles,
-    link=all_links,
-))
-
-TOOLTIPS = """
-<div style="width:300px;">
-ID: $index
-($x, $y)
-@title <br>
-@link <br> <br>
-</div>
-"""
-
-p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
-           title="UMAP projection of embeddings for the astro-ph.GA corpus: "+phrase)
-
-# p.hexbin(embedding[phrase_flags==1,0], embedding[phrase_flags==1,1], size=size_value,
-#          palette=np.flip(OrRd[8]), alpha=alpha_value)
-p.circle('x', 'y', size=3, source=source, alpha=0.3)
-st.bokeh_chart(p)
-
-fig = plt.figure(figsize=(10.5,9*0.8328))
-plt.scatter(embedding[0:,0], embedding[0:,1], s=2, alpha=0.1)
-plt.hexbin(embedding[phrase_flags==1,0], embedding[phrase_flags==1,1],
-           gridsize=size_value, cmap='viridis', alpha=alpha_value, extent=(-1,16,1.5,16), mincnt=10)
-plt.title("UMAP localization of heatmap keyword: "+phrase)
-plt.axis([0,15,2.5,15])
-clbr = plt.colorbar(); clbr.set_label('# papers')
-plt.axis('off')
-st.pyplot(fig)
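The heart of the deleted explorer above is the phrase heatmap: flag every abstract that contains the query phrase, then overlay a hexbin density of the flagged papers' UMAP coordinates on a scatter of the full corpus. A standalone sketch of the same logic (phrase_heatmap is a hypothetical name; the page does this inline):

import numpy as np
import matplotlib.pyplot as plt

def phrase_heatmap(embedding, all_text, phrase, gridsize=20, alpha=0.81):
    # Flag abstracts containing the phrase (case-insensitive substring match).
    flags = np.array([phrase.lower() in t.lower() for t in all_text])
    fig = plt.figure(figsize=(10.5, 9 * 0.8328))
    # All papers in UMAP space, faint, as background.
    plt.scatter(embedding[:, 0], embedding[:, 1], s=2, alpha=0.1)
    # Hexbin density of only the flagged papers, drawn on top.
    plt.hexbin(embedding[flags, 0], embedding[flags, 1],
               gridsize=gridsize, cmap='viridis', alpha=alpha, mincnt=10)
    plt.axis('off')
    return fig

As the page's own sidebar note warns, a phrase absent from the corpus leaves the flagged set empty and the hexbin call fails.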
pages/9_research_hotspots_2024.py DELETED
@@ -1,130 +0,0 @@
-import os
-import datetime
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-import requests
-import json
-from scipy import ndimage
-
-from langchain_openai import AzureOpenAIEmbeddings
-from langchain.llms import OpenAI
-from langchain_openai import AzureChatOpenAI
-
-os.environ["OPENAI_API_TYPE"] = "azure"
-os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
-os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
-os.environ["OPENAI_API_VERSION"] = "2023-05-15"
-
-embeddings = AzureOpenAIEmbeddings(
-    deployment="embedding",
-    model="text-embedding-ada-002",
-    azure_endpoint=st.secrets["endpoint1"],
-)
-
-llm = AzureChatOpenAI(
-        deployment_name="gpt4_small",
-        openai_api_version="2023-12-01-preview",
-        azure_endpoint=st.secrets["endpoint2"],
-        openai_api_key=st.secrets["key2"],
-        openai_api_type="azure",
-        temperature=0.
-    )
-
-
-@st.cache_data
-def get_feeds_data(url):
-    # data = cp.load(urlopen(url))
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Loaded data")
-    return data
-
-# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
-# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
-dateval = "16-Jun-2024"
-feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
-embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
-gal_feeds = get_feeds_data(feeds_link)
-arxiv_ada_embeddings = get_feeds_data(embed_link)
-
-@st.cache_data
-def get_embedding_data(url):
-    # data = cp.load(urlopen(url))
-    with open(url, "rb") as fp:
-        data = pickle.load(fp)
-    st.sidebar.success("Fetched data from API!")
-    return data
-
-# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
-url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
-e2d = get_embedding_data(url)
-# e2d, _, _, _, _ = get_embedding_data(url)
-
-ctr = -1
-num_chunks = len(gal_feeds)
-ctr = -1
-num_chunks = len(gal_feeds)
-all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []
-
-for nc in range(num_chunks):
-
-    for i in range(len(gal_feeds[nc].entries)):
-        text = gal_feeds[nc].entries[i].summary
-        text = text.replace('\n', ' ')
-        text = text.replace('\\', '')
-        all_text.append(text)
-        all_titles.append(gal_feeds[nc].entries[i].title)
-        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
-        all_links.append(gal_feeds[nc].entries[i].links[1].href)
-        all_authors.append(gal_feeds[nc].entries[i].authors)
-        temp = gal_feeds[nc].entries[i].published
-        datetime_object = datetime.datetime.strptime(temp[0:10]+' '+temp[11:-1], '%Y-%m-%d %H:%M:%S')
-        all_pubdates.append(datetime_object)
-        all_old.append((datetime.datetime.now() - datetime_object).days)
-
-def make_time_excess_plot(midage = 0, tolage = 1, onlyolder = False):
-
-    bw = 0.05
-    sigma = 4.0
-    mask = (np.abs(np.array(all_old) - midage*365) < tolage*365)
-
-    if onlyolder == True:
-        mask2 = (np.array(all_old) > midage*365 + tolage*365/2)
-        a = np.histogram2d(e2d[0:,0][mask2], e2d[0:,1][mask2], bins=(np.arange(0,17,bw)), density=True)
-    else:
-        a = np.histogram2d(e2d[0:,0], e2d[0:,1], bins=(np.arange(0,17,bw)), density=True)
-    b = np.histogram2d(e2d[0:,0][mask], e2d[0:,1][mask], bins=(np.arange(0,17,bw)), density=True)
-    temp = b[0].T - a[0].T
-    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')
-    vscale = (np.nanpercentile(temp,99.5) - np.nanpercentile(temp,0.5))/2
-
-    fig, ax = plt.subplots(1,1,figsize=(11,9))
-    plt.pcolor(a[1][0:-1] + (a[1][1]-a[1][0])/2, a[2][0:-1] + (a[2][1]-a[2][0])/2,
-               temp,cmap='bwr',
-               vmin=-vscale,vmax=vscale); plt.colorbar()
-    # plt.scatter(e2d[0:,0], e2d[0:,1],s=2,color='k',alpha=0.1)
-    plt.title('excess research over the last %.1f yrs centered at %.1f yrs' %(tolage, midage))
-    plt.axis([0,14,1,15])
-    plt.axis('off')
-    st.pyplot(fig)
-    return
-
-st.title('Research hotspots')
-st.markdown('[Includes papers up to: `'+dateval+'`]')
-
-midage = st.slider('Age', 0., 10., 0.)
-tolage = st.slider('Period width', 0., 10., 1.)
-
-st.markdown('Compare the research in a given time period to the full manifold.')
-make_time_excess_plot(midage, tolage, onlyolder = False)
-
-st.markdown('Compare the research in a given time period to research older than that.')
-make_time_excess_plot(midage, tolage, onlyolder = True)
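The hotspot computation in the deleted page reduces to: histogram the UMAP positions of papers published in a chosen time window, subtract a baseline histogram, and smooth the difference. A distilled sketch of the default branch (excess_density is a hypothetical name, mirroring make_time_excess_plot; the onlyolder option swaps the baseline for papers older than the window):

import numpy as np
from scipy import ndimage

def excess_density(e2d, all_old, midage, tolage, bw=0.05, sigma=4.0):
    # Select papers whose age (in days) falls within tolage years of midage years.
    ages = np.array(all_old)
    mask = np.abs(ages - midage * 365) < tolage * 365
    bins = np.arange(0, 17, bw)
    # Normalized 2D densities over the UMAP plane: full corpus vs. the window.
    base, _, _ = np.histogram2d(e2d[:, 0], e2d[:, 1], bins=(bins, bins), density=True)
    win, _, _ = np.histogram2d(e2d[mask, 0], e2d[mask, 1], bins=(bins, bins), density=True)
    # Smoothed difference: positive values mark regions over-represented
    # in the window, i.e. the "hotspots" plotted in bwr above.
    return ndimage.gaussian_filter(win.T - base.T, sigma, mode='nearest')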
pages/Untitled.ipynb DELETED
@@ -1,6 +0,0 @@
-{
- "cells": [],
- "metadata": {},
- "nbformat": 4,
- "nbformat_minor": 5
-}
requirements.txt CHANGED
@@ -8,7 +8,17 @@ langchain
 langchain_openai
 langchain_community
 langchain_core
+langchainhub
 openai
+instructor
+pydantic
 feedparser
 tiktoken
 chromadb
+streamlit-extras
+nltk
+cohere
+duckduckgo-search
+pytextrank
+spacy==3.7.5
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
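Among the new dependencies, spacy is pinned to 3.7.5 and the en_core_web_sm 3.7.1 model wheel is installed straight from its release URL, so the model version stays compatible with the pinned spaCy (3.7.x models match spaCy 3.7). pytextrank plugs into that pipeline as a component; a minimal usage sketch (the example sentence is made up):

import spacy
import pytextrank  # noqa: F401 -- importing registers the "textrank" component with spaCy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")
doc = nlp("Galaxy quenching at high redshift may be driven by AGN feedback.")
for phrase in doc._.phrases[:3]:
    print(phrase.rank, phrase.text)  # top-ranked key phrases by TextRank score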