Searching for meaning¶
“The limits of my language mean the limits of my world”
Ludwig Wittgenstein, Tractatus Logico-Philosophicus, 5.6
When OpenAI released ChatGPT in November 2022, it unleashed a new wave of interest in artificial intelligence and machine learning. Even though the necessary technological innovations had been available for nearly a decade, and the foundational principles date back even further, this seismic shift launched a “Cambrian explosion” of developments, especially in the area of large language models and generative transformers. Some skeptics argue that the models are “stochastic parrots”, only able to generate permutations of the content they have been trained on. Others consider the models a “black box”, out of reach of human understanding, or perhaps even “black magic”, working on completely esoteric principles.
Despite being a software developer, I never paid particular attention to the advances in machine learning, and was, in fact, quite skeptical of the broad utility of artificial intelligence. As for many people, interacting with tools such as Midjourney and ChatGPT changed that. Perhaps this is evidence that personal experience trumps theoretical assumptions, that a hands-on, practical exploration is crucial for proper understanding.
I became particularly excited about the possibilities of using machine learning models in the context of semantic search. After all, I've spent nearly ten years at Elastic, the company behind Elasticsearch, an advanced search and analytics engine based on Apache Lucene. Being well aware of all the complexities of inverted indices, scoring algorithms, peculiarities of language analysis, and so on, some of the examples I have stumbled upon looked almost like... yes, “black magic”.
One remarkable example I came across is an article by James Briggs: he loads the Quora dataset, a collection of about 400,000 question pairs, in a Python notebook, and uses the capabilities of Pinecone, a vector database, to search for similar questions, with a particularly impressive query variation. Naturally, I wanted to try the example with Elasticsearch, because it has all the necessary technical underpinnings of a vector database: the ability to store “embeddings” and perform a “nearest neighbour” search. This notebook is a journal of my journey, a hands-on exercise to facilitate a better understanding.
But before we dive into the Python code, I would like to take a historical detour. As I have discovered, one of the difficulties of machine learning and artificial intelligence as topics is the abundance of highly specific jargon and the lack of an intuitive mental model of how the technology works. For instance, it wouldn't help if I explained the term “embeddings” from the last paragraph by saying that they are “dense vectors” — not only would your eyes just glaze over, but I would have to explain two terms instead of one.
Furthermore, my background is not in machine learning or mathematics — I majored in philosophy, with special focus on the philosophy of language.
So let's start more than a hundred years ago, with the theory of language by Ferdinand de Saussure; hopefully, it will provide a more intuitive foundation for understanding, if not how, then at least why machine learning models work — without giving in to “black magic”.
Saussure's Theory of Language¶
In his “Course in General Linguistics”, first published in 1916, Ferdinand de Saussure defines language as a system of signs representing concepts. Today, a statement like this might sound unsurprising and quite tame. Historically, though, it represents a major shift in thinking about language. It challenges the traditional approach — which, to a certain extent, mirrors a naïve approach to language:
Some people regard language, when reduced to its elements, as a naming-process only — a list of words, each corresponding to the thing that it names. (...) This conception is open to criticism at several points.
(English translation by Wade Baskin, 3rd edition, p. 65. Available online from archive.org)
Saussure's argument upends this simplistic understanding, which considers language simply a collection of “labels” assigned to “things”, and is one of the catalysts for the so-called “linguistic turn” in the human sciences in the 20th century. Let's have a closer look at the three terms used in the definition, to better understand its significance.
First, language is defined as a system. Therefore, language is not a collection of isolated elements, but a structure, where each part plays a role in relation to the other parts. The meaning of each language element is to be understood in relation to others, not on its own (p. 118):
The important thing in the word is not the sound alone but the phonic differences that make it possible to distinguish this word from all others, for differences carry signification.
Saussure gives an example (p. 116):
Within the same language, all words used to express related ideas limit each other reciprocally; synonyms like French redouter ‘dread’, craindre ‘fear’, and avoir peur ‘be afraid’ have value only through their opposition: if redouter did not exist, all its content would go to its competitors.
Second, it is a system of signs, not “labels” or “words”. The sign, in Saussure's theory, unites the “signifier” (a sound pattern or other representation) and the “signified” (the meaning) — we can't separate one from the other (p. 65-66):
(...) the linguistic unit is a double entity. (...) The linguistic sign unites, not a thing and a name, but a concept and a sound-image. The latter is not the material sound, a purely physical thing, but the psychological imprint of the sound, the impression that it makes on our senses. The psychological character of our sound-images becomes apparent when we observe our own speech. Without moving our lips or tongue, we can talk to ourselves or recite mentally a selection of verse.
Saussure provides a very illustrative analogy (p. 113):
Language can also be compared with a sheet of paper: thought is the front and the sound the back; one cannot cut the front without cutting the back at the same time; likewise in language, one can neither divide sound from thought nor thought from sound; the division could be accomplished only abstractedly, and the result would be either pure psychology or pure phonology.
Third, what the sign represents is not a “thing in the world”, but a concept, ie. the abstract meaning of the sign. This meaning is not defined by a set of internal characteristics or a list of properties, or by a thing in the world which I may point to, but by its position within the system of signs (p. 117, emphasis mine):
Instead of pre-existing ideas then, we find in all the foregoing examples values emanating from the system. When they are said to correspond to concepts, it is understood that the concepts are purely differential and defined not by their positive content but negatively by their relations with the other terms of the system. Their most precise characteristic is in being what the others are not.
This theory has profound implications for our understanding of language, not only from a linguistic perspective, but from a broader philosophical one. Contrary to the aforementioned naïve understanding, which “assumes that ready-made ideas exist before words” (p. 65), Saussure asserts that language directly influences our understanding of the world. We might even say that language itself is the system for understanding the world. In other words, there is no meaning without language.
Since the meaning of the sign is defined by its position in the system of the language, Saussure further argues that the nature of the sign is arbitrary. In other words, “cat” represents a concept of a cat not because there is some natural relation between the sign and the concept, but simply because it does not represent a “dog” or a “chair”: “the bond between the signifier and the signified is arbitrary” (p. 67).
Saussure calls this theory “the arbitrary nature of the sign” and provides this clarification (p. 67-70):
The word arbitrary also calls for comment. The term should not imply that the choice of the signifier is left entirely to the speaker (we shall see below that the individual does not have the power to change a sign in any way once it has become established in the linguistic community); I mean that it is unmotivated, i.e. arbitrary in that it actually has no natural connection with the signified.
To summarize, the most important aspect of language is that it is a system of relations between signs, which is shared among speakers, and the meaning is given by the position of the sign within the system, not by a relationship to the signified conceptual entity. This ability, ie. to create and work with such a system, is what distinguishes humans, not the ability to come up with phonetic or graphic marks (p. 10):
(...) what is natural to mankind is not oral speech but the faculty of constructing a language, i.e. a system of distinct signs corresponding to distinct ideas.
How is such a long explanation of language as a system of distinct signs corresponding to distinct ideas useful for the primary topic of this article, semantic search, you may ask? In my opinion, it helps prevent both an overly skeptical and an overly optimistic understanding of what machine learning models are capable of, ie. the “stochastic parrot” on one side and the “black magic” on the other.
By continuing the trajectory of Saussure's argument, we can say that meaning is embedded in the system of language. That already answers the question of whether the machine learning models “understand the meaning” of what they “say”: of course they do, if meaning is given by a position within the system of signs. Of course they don't, for other definitions of “understanding the meaning” — if we ignore the question whether such definitions are even possible.
By continuing even further, since language is a “system of distinct signs”, we can use other signs than phonetic or graphic marks to represent concepts. Since the sign “cat” doesn't represent the concept of a cat because of some natural connection between them (the sign is “arbitrary” or “unmotivated”), we can just as well use a different representation — for example, a number. Or, more plausibly, a collection of numbers.
A numerical representation of language is a significant leap, because it opens up a new opportunity: performing arithmetic operations. All natural language processing depends on this step. The specific formulas and workflows might be quite complex, intimidating even, but the underlying foundation is simple: represent the system of signs with numbers and use them for computation.
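To make this less abstract, consider a purely illustrative toy example (the numbers below are invented, not produced by any real model): if we represent each sign by a couple of numbers, we can immediately compute how far apart two concepts lie.

import math

# A toy "system of signs": each word is represented by two invented numbers.
# Real models use hundreds of dimensions; two are enough to illustrate the idea.
toy_embeddings = {
    "cat": (0.9, 0.2),
    "dog": (0.8, 0.3),
    "pizza": (0.1, 0.9),
}

def distance(a, b):
    # Euclidean distance between two points, ie. how far apart two signs are
    return math.dist(toy_embeddings[a], toy_embeddings[b])

print("cat ⇔ dog:  ", round(distance("cat", "dog"), 2))
print("cat ⇔ pizza:", round(distance("cat", "pizza"), 2))

Real models, as we will see shortly, use hundreds of such numbers instead of two, but the principle is the same.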
Lexical and Semantic Search¶
In fact, representing language elements with numbers is the foundation of traditional full-text search. The main difference between a modern inverted index and a traditional concordance — or an index at the back of a book — is that an inverted index stores more information than simply the occurrences of terms. It also tracks their positions and the frequency of their occurrences within a document. This already allows for certain arithmetic operations, such as phrase search (searching for terms occurring in a specific order) and proximity search (finding terms that occur within a certain number of positions from each other).
Using these numbers, specifically the frequency of term occurrences within a document, as well as the overall frequency of the term in the whole collection of documents, is the basis of the traditional method of scoring search results: the TF-IDF (term frequency–inverse document frequency) formula and more sophisticated variants like BM25. Put simply, the more frequently a term occurs within a specific document, the higher that document is ranked in the list of relevant documents. Conversely, the more frequently that particular term appears across the entire collection, the less the document rises in the list. Storing statistical information about the terms in the collection allows more sophisticated operations than a simple lookup such as “this particular document contains this particular word”.
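As a rough illustration of the principle (a naïve sketch, not the exact formula used by Lucene or BM25; the toy corpus is invented for the example):

import math

# A toy corpus of three "documents"
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the pizza was delicious",
]

def tf_idf(term, document, corpus):
    words = document.split()
    # Term frequency: how often the term occurs in this document
    tf = words.count(term) / len(words)
    # Document frequency: in how many documents of the collection the term occurs
    df = sum(1 for doc in corpus if term in doc.split())
    # Rare terms get a higher weight than common ones
    idf = math.log(len(corpus) / (1 + df)) + 1
    return tf * idf

for doc in corpus:
    print(f"'cat' in '{doc}': {tf_idf('cat', doc, corpus):.3f}")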
The fundamental distinction between the traditional “lexical” search and “semantic” search is that lexical search can only find documents that contain the exact terms present in the query. By “terms”, we mean variations of words that the search engine recognizes as carrying the same meaning. Of course, modern search engines like Elasticsearch have sophisticated tools for transforming “words” into “terms”, from simple ones like removing capitalization to more advanced ones like stemming (removing suffixes, walking ⇒ walk
), lemmatization (reducing different inflected forms to a basic one, worst ⇒ bad
), or synonymization. These help in broadening the scope of the query (and finding more relevant documents).
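If you are curious what these transformations actually produce, the Analyze API of Elasticsearch lets you inspect the terms generated by a particular analyzer. A minimal sketch, assuming a running cluster and the official Python client (the same setup used later in this notebook):

from elasticsearch import Elasticsearch

es = Elasticsearch(hosts="http://localhost:9200")  # adjust the URL to your cluster

# The "english" analyzer lowercases the text, removes stopwords and stems each word,
# so variations like "walking" and "walked" are reduced to the same term
response = es.indices.analyze(analyzer="english", text="Walking the walked paths")
print([token["token"] for token in response["tokens"]])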
However, even with these transformations, you cannot search for “cat” using a query like “a domestic animal which catches mice” if those specific terms are missing in the document. A search backed by a language model, on the other hand, is quite capable of retrieving documents for an “indirect” query like that. It is not because it “understands” that particular phrase in a naïvely anthropomorphic sense. It is because, as we have seen before, it understands the system of distinct signs corresponding to distinct ideas: the human language. In this system, the concept occupying the position closest to the sign “a domestic animal which catches mice” is, yes, the concept of a cat.
In semantic search, the relevance of search results is therefore determined by semantic proximity within the system, not just keyword matching, however sophisticated. As the name suggests, a “lexical search” behaves much like searching for a word definition in a dictionary (lexicon): it works very well if you know the word you're searching for. Otherwise, you might as well read the entire dictionary.
Semantic Search with Elasticsearch¶
Interestingly, the supporting infrastructure for semantic search has been part of Elasticsearch for many years — the dense_vector
mapping field was introduced in version 7.0, released in April 2019. Version 7.3, which came out a few months later, added support for specifying dimensions for the type and introduced predefined functions to the script_score
query, enabling the computation of similarity scores for documents. Version 8.0
, released in February 2022, further improved the dense_vector
implementation and added the "approximate nearest neighbor" search endpoint, effectively tying together the key components for a comprehensive implementation of semantic search, including the ability to run third-party models within the cluster itself. In the latest version, 8.8, Elastic not only focused on improving the communication of its AI capabilities in response to the current wave of interest, but also added enhancements like a higher number of dimensions in the dense_vector
field, allowing for storage of embeddings from large language models like those developed by OpenAI, and provided a custom, built-in model, the Elastic Learned Sparse Encoder.
In the rest of this notebook, I would like to demonstrate how to use a model from Sentence Transformers, using the queries from the article by James Briggs mentioned above. Hopefully, you'll see that Elasticsearch is a quite capable vector database, with an efficient and convenient API for performing similarity search.
But first, I would like to address the term “vector”. (You may have noticed I've used the term dense_vector
three times in the opening paragraph.) If, like me, you don’t have a background in mathematics, the word and the concept of vectors might seem intimidating at first. It doesn't help when the usual explanation is that a vector is an “object with magnitude and direction”, as it's very hard to come up with a reasonable mental model for such an object in the context of human language. A more useful model might be to think about vectors in a “vector space” as coordinates.
Returning briefly to Saussure's theory of language outlined before, since meaning is given by a “position” of the sign in the shared system of signs, we can give “coordinates” for this position. Further, we can use a numerical representation for these coordinates, opening up the possibility of arithmetic operations. This numerical representation is commonly called embeddings. If we set aside the mathematical theory, the physical representation is a list of decimal numbers: [0.01, 0.05, -0.04, 0.06, -0.1, ...]
. The length of the list is called the dimension, with each dimension representing a specific characteristic of the meaning.
Let's have a closer look at the mechanics using a free, open source, pre-trained model from the Sentence Transformers framework, provided by the Ubiquitous Knowledge Processing Lab at the Technical University of Darmstadt.
Text Embeddings¶
To better understand embeddings, which are the foundation of semantic search (among other natural language processing tasks), let's load the model from Hugging Face and use it to generate embeddings for a couple of words. But first, let's install the necessary libraries and set up our environment.
%pip -q install \
python-dotenv ipywidgets tqdm humanize \
pandas numpy matplotlib altair \
datasets sentence-transformers \
elasticsearch
%load_ext dotenv
%dotenv
from tqdm.notebook import tqdm as notebook_tqdm
Let's download and initialize the all-MiniLM-L6-v2
model.
from sentence_transformers import SentenceTransformer
MODEL_ID="all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_ID)
print("Model dimensions:", model.get_sentence_embedding_dimension())
Model dimensions: 384
As we can see, the model has 384 dimensions. This is the “size” of the model's vector space. It is not particularly large (embeddings from many current models have thousands of dimensions), but it is quite sufficient for our purposes. Let's encode the word “cat”, ie. create embeddings for it:
embeddings_for_cat = model.encode("cat")
print(list(embeddings_for_cat)[:5] + ["..."])
[0.03733039, 0.0511619, -0.00030606816, 0.060209926, -0.11749442, '...']
Note that the output is truncated to the first 5 values, to not overwhelm the display with a long list of numbers. (Also note that using this model for single words is just illustrative, as it's optimized for sentences. For word embeddings, using models like Word2Vec or GloVe is more typical.)
Let's encode a different word, “dog”:
embeddings_for_dog = model.encode("dog")
print(list(embeddings_for_dog)[:5] + ["..."])
[-0.05314704, 0.014194381, 0.0071458234, 0.06860866, -0.07848035, '...']
The output illustrates how challenging it is to come up with a reasonable mental model for this type of text encoding: as humans, we have a pretty good grasp of the relationship between the sign “cat” or “dog” and the domestic animal it represents. It's very hard to come up with a similarly good understanding of a numerical representation like this.
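Even so, these numbers are not entirely opaque. For instance, we can measure how close “cat” and “dog” are in the model's vector space by computing the cosine similarity of their embeddings, the same measure we will later configure Elasticsearch to use (a small illustrative sketch using plain NumPy):

import numpy as np

def cosine_similarity(a, b):
    # Dot product of the two vectors, normalized by their lengths
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("cat ⇔ dog:", round(float(cosine_similarity(embeddings_for_cat, embeddings_for_dog)), 3))
print("cat ⇔ cat:", round(float(cosine_similarity(embeddings_for_cat, embeddings_for_cat)), 3))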
However, as stated before, a numerical representation has a distinct advantage — we can perform mathematical operations with the values. Besides measuring similarity, we can also try to visualize them, for example in a scatter plot. Let's wrap the list in a Pandas dataframe, so we can take advantage of its rich formatting when displayed in a Jupyter notebook, and its data manipulation capabilities in the later steps.
import pandas as pd
df = pd.DataFrame(embeddings_for_cat, columns=["embedding"])
df
embedding | |
---|---|
0 | 0.037330 |
1 | 0.051162 |
2 | -0.000306 |
3 | 0.060210 |
4 | -0.117494 |
... | ... |
379 | 0.053080 |
380 | 0.159662 |
381 | 0.061269 |
382 | 0.060815 |
383 | 0.049280 |
384 rows × 1 columns
We can use the built-in plotting functionality to display a simple chart.
df.reset_index().plot.scatter(x="index", y="embedding");
The chart gives us only a very abstract “picture” of the data; basically, a rough distribution of values (in the range of -0.15 to 0.23).
On their own, these numbers are rather meaningless. This is actually expected, when we remember the theory of language as a “system of distinct signs”. No word has meaning in isolation; its meaning comes from the relation to other words in the system. So, what if we try to visualize both words, “cat” and “dog”?
Let's create a new dataframe, using “cat” and “dog” as an index and compacting the embeddings to a single column.
df = pd.DataFrame(
[
[embeddings_for_cat],
[embeddings_for_dog],
],
index=["cat", "dog"], columns=["embeddings"]
)
df
embeddings | |
---|---|
cat | [0.03733039, 0.0511619, -0.00030606816, 0.0602... |
dog | [-0.05314704, 0.014194381, 0.0071458234, 0.068... |
In order to plot the data, we need to do a couple of transformations.
# Add a new column to store the original index values (0-383) for each embedding
df["position"] = [list(range(len(df.embeddings[i]))) for i in df.index]
# Convert the `embeddings` and `position` columns from "wide" to "long" format
df_exploded = df.explode(["embeddings", "position"])
# Convert the index into a regular column
df_exploded = df_exploded.reset_index()
# Rename columns for more clarity
df_exploded = df_exploded.rename(columns={"index": "animal", "embeddings": "embedding"})
# Add a new column with numerical values mapped from the `animal` column values
df_exploded["color"] = df_exploded["animal"].map({"cat": 1, "dog": 2})
df_exploded
animal | embedding | position | color | |
---|---|---|---|---|
0 | cat | 0.03733 | 0 | 1 |
1 | cat | 0.051162 | 1 | 1 |
2 | cat | -0.000306 | 2 | 1 |
3 | cat | 0.06021 | 3 | 1 |
4 | cat | -0.117494 | 4 | 1 |
... | ... | ... | ... | ... |
763 | dog | 0.03667 | 379 | 2 |
764 | dog | 0.111445 | 380 | 2 |
765 | dog | 0.029857 | 381 | 2 |
766 | dog | 0.023905 | 382 | 2 |
767 | dog | 0.110093 | 383 | 2 |
768 rows × 4 columns
Now we can plot the transformed data.
(df_exploded
.plot
.scatter(x="position", y="embedding", c="color", colormap="tab10")
.collections[0].colorbar.remove())
It doesn't seem like a naïve visualization such as this one will be of much help. It highlights, though, a fundamental difficulty of multi-dimensional vector spaces. As humans, we are very capable of visualizing objects in 2D or 3D space. More dimensions are simply not something we can effectively imagine, much less “draw”.
A trick we can use is to reduce the dimensionality, in this case from 384 dimensions to 2. (Again: we are able to do that because of the fact that we're working with numerical representation of language.) There are many algorithms for doing so — we'll use principal component analysis (PCA), as it's readily available in the scikit-learn
package and works well for small datasets. (See an excellent article from the Plotly package documentation for examples on using the t-SNE and UMAP algorithms.)
import numpy as np
from sklearn.decomposition import PCA
# Drop the `position` column as it's no longer needed
df.drop(columns=["position"], inplace=True, errors="ignore")
# Convert embeddings to a 2D array and display their shape
print("Embeddings shape:", np.stack(df["embeddings"]).shape)
# Initialize the PCA reducer to convert embeddings into arrays of length of 2
reducer = PCA(n_components=2)
# Reduce the embeddings, store them in a new dataframe column and display their shape
df["reduced"] = reducer.fit_transform(np.stack(df["embeddings"])).tolist()
print("Reduced embeddings shape:", np.stack(df["reduced"]).shape)
df
Embeddings shape: (2, 384) Reduced embeddings shape: (2, 2)
embeddings | reduced | |
---|---|---|
cat | [0.03733039, 0.0511619, -0.00030606816, 0.0602... | [-0.41192373633384705, 3.2534185123722636e-08] |
dog | [-0.05314704, 0.014194381, 0.0071458234, 0.068... | [0.4119238257408142, 3.253417801829528e-08] |
As we can see, the reduced embeddings have only two dimensions, so we can plot them on a Cartesian plane as x
and y
coordinates, using the Vega-Altair package. Let's create a function so we can reuse the code later.
import altair as alt
def scatterplot(
data: pd.DataFrame,
tooltips=False,
labels=False,
width=800,
height=200,
) -> alt.Chart:
base_chart = (
alt.Chart(data)
.encode(
alt.X("x", scale=alt.Scale(zero=False)),
alt.Y("y", scale=alt.Scale(zero=False)),
)
.properties(width=width, height=height)
)
if tooltips:
base_chart = base_chart.encode(alt.Tooltip(["text"]))
circles = base_chart.mark_circle(
size=200, color="crimson", stroke="white", strokeWidth=1
)
if labels:
labels = base_chart.mark_text(
fontSize=13,
align="left",
baseline="bottom",
dx=5,
).encode(text="text")
chart = circles + labels
else:
chart = circles
return chart
source = pd.DataFrame(
{
"text": df.index,
"x": df["reduced"].apply(lambda x: x[0]).to_list(),
"y": df["reduced"].apply(lambda x: x[1]).to_list(),
}
)
scatterplot(source, labels=True)
All right. The chart is rather underwhelming — just two circles, placed randomly on a canvas. We might have expected that the marks would be displayed close to each other; after all, cats and dogs share a lot of characteristics. However, working with the premise of language as a system, our limited “system” contains only two words: “cat” and “dog”.
We, as humans, might consider these signs closely related: they both represent furry animals with four legs, commonly kept as pets, both carnivorous mammals, and so on. But such intuition comes from the very large system of our language, with many other concepts occupying different positions. To quote Saussure from earlier, “the concepts are purely differential and defined not by their positive content but negatively by their relations with the other terms of the system”.
Let's try adding more words to the collection, then, and see if the picture changes in a meaningful way.
words = ["cat", "dog", "table", "chair", "pizza", "pasta", "asymptomatic"]
# Create a new dataframe
df = pd.DataFrame(
[[model.encode(word)] for word in words],
columns=["embeddings"],
index=words,
)
# Perform dimensionality reduction
df["reduced"] = reducer.fit_transform(np.stack(df["embeddings"])).tolist()
df
embeddings | reduced | |
---|---|---|
cat | [0.03733039, 0.0511619, -0.00030606816, 0.0602... | [-0.43801048398017883, -0.26243406534194946] |
dog | [-0.05314704, 0.014194381, 0.0071458234, 0.068... | [-0.4829082787036896, -0.2821101248264313] |
table | [0.031432427, 0.0013003134, -0.091571234, 0.01... | [0.21471957862377167, -0.3291183412075043] |
chair | [-0.00027702627, 0.05202283, -0.054319937, 0.0... | [0.20435161888599396, -0.1891215443611145] |
pizza | [-0.08696939, 0.06991054, -0.0150973685, 0.096... | [-0.1220669075846672, 0.525505006313324] |
pasta | [-0.07682365, -0.011072064, -0.022458978, 0.08... | [-0.0810772180557251, 0.5807340145111084] |
asymptomatic | [0.031974357, 0.020842418, -0.064985596, 0.171... | [0.7049920558929443, -0.0434550903737545] |
Let's display a scatterplot chart again.
source = pd.DataFrame(
{
"text": df.index,
"x": df["reduced"].apply(lambda x: x[0]).to_list(),
"y": df["reduced"].apply(lambda x: x[1]).to_list(),
}
)
scatterplot(source, labels=True)
Much better! We can clearly see three “clusters” of related words, cat ⇔ dog
, pizza ⇔ pasta
, table ⇔ chair
. We can also see that the word asymptomatic
stands alone, apart from the three clusters.
Is it the “black magic” of artificial intelligence? Not really. The all-MiniLM-L6-v2
model has been trained on a large amount of text from Reddit, Stack Exchange, Wikipedia, Quora, and other sources, written by humans. Therefore, it does have the meaning of these words almost literally “embedded” in the 384-dimensional vector that it generates.
Loading the Dataset¶
With a better, practical understanding of why and how text embeddings work, we can return to the initial motivation for this notebook: recreating the example of semantic search from the article by James Briggs, using Elasticsearch instead of Pinecone.
We will use the datasets
package from Hugging Face to load the Quora data. It's a very sophisticated “wrapper” around the data, providing convenient features like built-in caching of downloaded files, as well as efficient processing capabilities that we'll use to manipulate the data.
The Hugging Face datasets are primarily oriented toward providing data for training models, so they are divided into “splits” like train
, test
, validation
. Our particular dataset has only the train
split. Let's load it and display some metadata about the dataset.
import humanize
import datasets
dataset = datasets.load_dataset("quora", split="train")
print("Description:", dataset.info.description, "\n")
print("Homepage:", dataset.info.homepage)
print("Downloaded size:", humanize.naturalsize(dataset.info.download_size))
print("Number of examples:", humanize.intcomma(dataset.info.splits["train"].num_examples))
print("Features:", dataset.info.features)
Description: The Quora dataset is composed of question pairs, and the task is to determine if the questions are paraphrases of each other (have the same meaning). Homepage: https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs Downloaded size: 58.2 MB Number of examples: 404,290 Features: {'questions': Sequence(feature={'id': Value(dtype='int32', id=None), 'text': Value(dtype='string', id=None)}, length=-1, id=None), 'is_duplicate': Value(dtype='bool', id=None)}
As we can see, the dataset contains more than 400,000 “question pairs”. Let's look at the first five records.
dataset[:5]
{'questions': [{'id': [1, 2], 'text': ['What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?']}, {'id': [3, 4], 'text': ['What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?']}, {'id': [5, 6], 'text': ['How can I increase the speed of my internet connection while using a VPN?', 'How can Internet speed be increased by hacking through DNS?']}, {'id': [7, 8], 'text': ['Why am I mentally very lonely? How can I solve it?', 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?']}, {'id': [9, 10], 'text': ['Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'Which fish would survive in salt water?']}], 'is_duplicate': [False, False, False, False, False]}
The primary focus of this dataset is to provide reliable data for duplicate detection:
Our first dataset is related to the problem of identifying duplicate questions.
An important product principle for Quora is that there should be a single question page for each logically distinct question. As a simple example, the queries “What is the most populous state in the USA?” and “Which state in the United States has the most people?” should not exist separately on Quora because the intent behind both is identical. (...)
The dataset that we are releasing today will give anyone the opportunity to train and test models of semantic equivalence, based on actual Quora data. (...)
— Kornél Csernai, First Quora Dataset Release: Question Pairs
Therefore, the dataset contains question pairs, which are labeled as either duplicates or not. Let's display some examples of duplicated questions, using the utilities of the datasets
package for selecting and filtering the data.
(dataset
.select(range(1000))
.filter(lambda record: record["is_duplicate"])[:3])
{'questions': [{'id': [11, 12], 'text': ['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]}, {'id': [15, 16], 'text': ['How can I be a good geologist?', 'What should I do to be a great geologist?']}, {'id': [23, 24], 'text': ['How do I read and find my YouTube comments?', 'How can I see all my Youtube comments?']}], 'is_duplicate': [True, True, True]}
Somewhat paradoxically, the dataset does not contain the question “What is the most populous state in the USA?” mentioned in the article.
dataset.filter(lambda record: "What is the most populous state in the USA?" in record["questions"]["text"])[:]
{'questions': [], 'is_duplicate': []}
Let's start with cleaning up and transforming the dataset, so we can load the individual questions into Elasticsearch as separate documents.
First, we'll remove the is_duplicate
column and “flatten” the questions
property, ie. expand it into separate columns.
print("Original dataset:", dataset, "\n")
# Remove the `is_duplicate` column
dataset = dataset.remove_columns("is_duplicate")
# Flatten the dataset
dataset = dataset.flatten()
print("Transformed dataset:", dataset, "\n")
dataset[:5]
Original dataset: Dataset({ features: ['questions', 'is_duplicate'], num_rows: 404290 }) Transformed dataset: Dataset({ features: ['questions.id', 'questions.text'], num_rows: 404290 })
{'questions.id': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], 'questions.text': [['What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?'], ['What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'], ['How can I increase the speed of my internet connection while using a VPN?', 'How can Internet speed be increased by hacking through DNS?'], ['Why am I mentally very lonely? How can I solve it?', 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'], ['Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'Which fish would survive in salt water?']]}
We have improved the structure somewhat, but we still have two questions in the questions.text
field. For indexing the questions efficiently, it would be best if each question is stored as a separate row. We will use the powerful map()
functionality provided by the package, expanding the questions.id
and questions.text
columns.
# Expand the values from the lists into separate lists
def expand_values(batch):
ids = []
texts = []
for id_list, text_list in zip(batch["questions.id"], batch["questions.text"]):
ids.extend(id_list)
texts.extend(text_list)
return {"id": ids, "text": texts}
# Run the "expand_values" function for batches of rows in the dataset
dataset = dataset.map(
expand_values,
batched=True,
remove_columns=dataset.column_names,
desc="Expand Questions",
)
print("Transformed dataset:", dataset, "\n")
dataset[:5]
Transformed dataset: Dataset({ features: ['id', 'text'], num_rows: 808580 })
{'id': [1, 2, 3, 4, 5], 'text': ['What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', 'How can I increase the speed of my internet connection while using a VPN?']}
The dataset now contains twice the number of rows, as each question is stored as a separate row.
The next step is removing duplicated questions. We didn't use the is_duplicate
column for de-duplication, because we still want to index all the questions, even when they are semantically the same (“How can I be a good geologist?” vs. “What should I do to be a great geologist?”). We just want to remove questions where the text is exactly the same. We'll use the map()
function again.
# Create a Python set to keep track of processed questions
seen = set()
# Remove rows with exactly the same text value
def remove_duplicate_rows(batch):
global seen
output = {"id": [], "text": []}
for id, text in zip(batch["id"], batch["text"]):
if text not in seen:
seen.add(text)
output["id"].append(id)
output["text"].append(text)
return output
# Run the "remove_duplicate_rows" function for batches of rows in the dataset
dataset = dataset.map(
remove_duplicate_rows,
batched=True,
batch_size=1000,
remove_columns=dataset.column_names,
desc="Remove Duplicates",
)
dataset
Remove Duplicates: 0%| | 0/808580 [00:00<?, ? examples/s]
Dataset({ features: ['id', 'text'], num_rows: 537362 })
The dataset now contains 537,362 unique questions.
We will generate text embeddings for these questions, using the same method previously demonstrated with “cat” and “dog”. Later on, we will index them into Elasticsearch, in order to use a specialized query type called “approximate nearest neighbors” to find semantically similar documents.
Let's process the dataset with the map()
method again.
import time
%env TOKENIZERS_PARALLELISM=true
# Compute embeddings for batches of question text
def compute_embeddings(batch):
return { "embeddings": model.encode(sentences=batch["text"]) }
try:
start = time.perf_counter()
dataset = dataset.map(
compute_embeddings,
batched=True,
batch_size=1000,
desc="Compute Embeddings",
)
except KeyboardInterrupt:
print("Creating text embeddings interrupted by the user...")
print(
"Dataset with embeddings:", dataset,
f"(duration: {humanize.precisedelta(time.perf_counter() - start)})",
"\n")
# Print a sample of the embeddings for first question
print(list(dataset[:1]["embeddings"][0][:5]) + ["..."])
env: TOKENIZERS_PARALLELISM=true
Compute Embeddings: 0%| | 0/1000 [00:00<?, ? examples/s]
Dataset with embeddings: Dataset({ features: ['id', 'text', 'embeddings'], num_rows: 1000 }) (duration: 25 minutes 8 seconds) [0.06814990937709808, -0.03966414928436279, -0.06096725910902023, 0.007466054521501064, -0.0587276816368103, '...']
As you can see, this is a resource-intensive operation, which can take more than 20 minutes on an Apple notebook with an M1 chip. To keep the full dataset with embeddings, use the save_to_disk()
method.
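For example (the path is arbitrary):

# Save the processed dataset, including the computed embeddings, to disk...
dataset.save_to_disk("data/quora-with-embeddings")

# ...and load it back in a later session, skipping the expensive computation
# dataset = datasets.load_from_disk("data/quora-with-embeddings")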
Indexing the Data¶
In the next step, we'll create an Elasticsearch index with specific mappings for storing the embeddings in a dense_vector
field type, and the question text in a regular text
field, processed with the english
analyzer.
If you want to try running these examples yourself, you need an Elasticsearch cluster. Use the Docker Compose file available in this repository to launch a cluster locally.
import os
from elasticsearch import Elasticsearch
INDEX_NAME = "quora-with-embeddings-v1"
es = Elasticsearch(hosts=os.getenv("ELASTICSEARCH_URL"), request_timeout=300)
if not es.indices.exists(index=INDEX_NAME):
es.indices.create(
index=INDEX_NAME,
mappings={
"properties": {
"text": {
"type": "text",
"analyzer": "english",
},
"embeddings": {
"type": "dense_vector",
"dims": model.get_sentence_embedding_dimension(),
"index": "true",
"similarity": "cosine",
},
}
},
)
print(f"Created Elasticsearch index at {os.getenv('ELASTICSEARCH_URL')}/{INDEX_NAME}?pretty")
else:
print(f"Skipping index creation, index already exists")
Created Elasticsearch index at http://localhost:9200/quora-with-embeddings-v1?pretty
Now we're ready to index the data. We'll use the parallel_bulk()
helper of the Elasticsearch client, as it's the most convenient way of loading the data: it optimizes the process by running the client in multiple threads, and it accepts a Python iterable or generator, and thus provides a high-level interface for indexing large data sets. We will use the to_iterable_dataset()
method of the dataset to convert it to a generator. This conversion is especially beneficial for large datasets, as it allows for more memory-efficient processing.
import os
import time
from elasticsearch.helpers import parallel_bulk
if es.count(index=INDEX_NAME)["count"] >= len(dataset):
print("Skipping indexing, data already indexed.")
else:
progress = notebook_tqdm(unit="docs", total=len(dataset))
indexed = 0
start = time.perf_counter()
# Remove the "id" column and convert the dataset to generator
iterable_dataset = dataset.remove_columns(["id"]).to_iterable_dataset()
try:
print(f"Indexing dataset to [{INDEX_NAME}]...")
for ok, result in parallel_bulk(
es,
iterable_dataset,
index=INDEX_NAME,
thread_count=os.cpu_count()//2,
):
indexed += 1
progress.update(1)
print(f"Indexed [{humanize.intcomma(indexed)}] documents in {humanize.precisedelta(time.perf_counter() - start)}")
except KeyboardInterrupt:
print(f"Indexing interrupted by the user, indexed [{humanize.intcomma(indexed)}] documents in {humanize.precisedelta(time.perf_counter() - start)}")
Indexing dataset to [quora-with-embeddings-v1]...
0%| | 0/537362 [00:00<?, ?docs/s]
Indexed [537,362] documents in 5 minutes and 54 seconds
All right! It looks like our documents have been successfully indexed. Let's check the index with the Cat Indices API, displaying the number of documents and the size of the index on disk.
res = es.cat.indices(index=INDEX_NAME, format="json")
print(
f"Index [{INDEX_NAME}] contains [{humanize.intcomma(res.body[0]['docs.count'])}] documents",
f"and uses [{res.body[0]['pri.store.size'].upper()}] of disk space"
)
Index [quora-with-embeddings-v2] contains [537,362] documents and uses [3.9GB] of disk space
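Next, let's define a few helper functions: search_keywords() performs a lexical search with the match query, search_embeddings() performs a semantic search with the knn option, and styled() formats the results for display in the notebook.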
import pandas as pd
# Lexical search with the `match` query
def search_keywords(query, size=10):
res = es.search(
index=INDEX_NAME,
query={"match": {"text": query}},
size=size,
source_includes=["text", "embeddings"],
)
return pd.DataFrame(
[
{"text": hit["_source"]["text"], "embeddings": hit["_source"]["embeddings"], "score": hit["_score"]}
for hit in res["hits"]["hits"]
]
)
# Semantic search with the `knn` option
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-knn
def search_embeddings(query, size=10):
res = es.search(
index=INDEX_NAME,
knn={
"field": "embeddings",
"query_vector": model.encode(query, normalize_embeddings=True),
"k": size,
"num_candidates": 1000,
},
size=size,
source_includes=["text", "embeddings"],
)
return pd.DataFrame(
[
{"text": hit["_source"]["text"], "embeddings": hit["_source"]["embeddings"], "score": hit["_score"]}
for hit in res["hits"]["hits"]
]
)
# Returns the dataframe without the "embeddings" column and with a formatted "score" column
def styled(df):
return (df[["score", "text"]]
.style
.set_table_styles([dict(selector="th,td", props=[("text-align", "left")])])
.hide(axis="index")
.format({"score": "{:.3f}"})
.background_gradient(subset=["score"], cmap="Greys"))
# Add the utility function to the dataframe class
pd.DataFrame.styled = styled
Let's perform a lexical search with a query from the original article, “Which city has the highest population in the world?”.
search_keywords("Which city has the highest population in the world?").styled()
score | text |
---|---|
26.896 | Which city in India has the highest percentage of youth population? |
23.894 | Which is the most populated city in the world.? |
23.263 | Which city has the highest number of historic buildings/monuments in the world? |
22.259 | Which state has highest Brahmin population in India? |
22.259 | Which Indian state has the highest population density? |
21.151 | Which place has the highest Asian Indian population in the USA? |
20.944 | Which city in India has a large Parsi population? |
20.905 | Which city in India has the highest number of pubs? |
20.905 | Which city in India has the highest standard of living? |
20.380 | What are the most populated cities in the world? |
We can immediately observe that most results are not very relevant to our query. Apart from the items “Which is the most populated city in the world.?”
and “What are the most populated cities in the world?”
, most results have almost no relation to the concept of “most populated city”. We can also observe how the default scoring algorithm rewards matches on the phrase “Which city (…)” at the beginning of the question, even though the rest of the text is not relevant (number of historic buildings, standard of living, etc.).
Let's perform a semantic search with the same query, to see if we get different results.
search_embeddings("Which city has the highest population in the world?").styled()
score | text |
---|---|
0.941 | Which is the most populated city in the world.? |
0.939 | What is the most populated city in the world? |
0.921 | What are the most populated cities in the world? |
0.907 | Which is the largest city in the world? |
0.898 | Which is biggest city in the world? |
0.893 | What's the world's largest city? |
0.893 | Which country has the most population? |
0.883 | What country has the biggest population? |
0.873 | Which is the most populated country? |
0.871 | Which is the largest city in the world by area? |
It is quite evident that these results are much more relevant to the concept of our query. The most relevant results from the lexical search are returned on top, and the next couple of results are almost synonymous with the “most populated city” concept, eg. “largest city” or “biggest city”. Notice also how the “Which is the largest city in the world by area?”
result is listed after results related to countries (not cities). That's very much expected: our query is about the size of the population, not the area.
Let's try something unexpected. Let's rephrase the query so that it doesn't contain any significant keywords from the matching documents, leaving out the determiner “which”, replacing “city” with “urban location” and “highest population” with “highest concentration of homo sapiens”, admittedly a very unnatural phrase. (All credit for this reformulation goes to James Briggs, see a specific version of the original article.)
search_embeddings("Urban locations with the highest concentration of homo sapiens").styled()
score | text |
---|---|
0.805 | What are the most populated cities in the world? |
0.795 | What are the world's most advanced cities? |
0.794 | What is the most populated city in the world? |
0.790 | Which is the most populated city in the world.? |
0.785 | What is the most isolated city in the world, with over a million metro area inhabitants? |
0.781 | What is the most visited place ever (largest number of people occupying the very same space throughout history)? |
0.778 | What are 5 scientific challenges faced by big cities due to urbanization? |
0.775 | Which is the loneliest place on earth inhabited by less than 5 persons? |
0.770 | What are the world's most technologically advanced cities? |
0.770 | How much homo sapiens lived and died on planet earth? |
Perhaps surprisingly, we get results mostly relevant to our query, especially at the top of the list, even though our query is deliberately constructed to not have a direct overlap between the query terms and document terms. This demonstrates the strongest point of semantic search in a powerful way.
Let's try the same query with the lexical search.
search_keywords("Urban locations with the highest concentration of homo sapiens").styled()
score | text |
---|---|
30.746 | When will "Homo sapiens sapiens" evolves in "Homo sapiens sapiens sapiens"? |
29.510 | How different are Homo sapien sapiens from Homo sapiens? |
29.510 | What's the difference between homo sapiens and homo sapiens sapiens? |
28.661 | How are Homo sapiens idaltu different from Homo sapiens sapiens? |
26.487 | Why do they classify the modern man as Homo sapiens sapiens? What makes a being a Homo sapiens sapiens? |
25.179 | What evolved into Homo sapiens? |
22.394 | Is it possible that there is another planet with homo sapien humans? |
22.394 | How did homo sapiens evolve consciousness? |
21.220 | How do we define humans/Homo sapiens? |
21.220 | Why are Homo sapiens and Neanderthals considered to be separate species? |
We get no results relevant to our query. With our understanding of the difference between lexical and semantic search, this shouldn't come as much of a surprise. In fact, this effect has a technical name, the vocabulary mismatch problem, where the query terms are too different from the document terms. Even term manipulations like stemming or lemmatization, mentioned earlier, can't prevent this mismatch. Traditionally, the solution is to provide a list of synonyms to the search engine. However, this quickly gets complicated, as ultimately we would need to supply a full thesaurus. (Moreover, due to the way the scoring algorithm usually works, it wouldn't be able to discriminate between a word and its synonyms when computing the score for each result.)
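To give an idea of what that traditional remedy looks like, here is a minimal, hypothetical sketch of an index that treats a handful of terms as synonyms, using Elasticsearch's synonym token filter (the index name and the synonym list are invented for illustration):

# A hypothetical index where eg. "city" and "urban location" are treated as the same term
es.indices.create(
    index="questions-with-synonyms",
    settings={
        "analysis": {
            "filter": {
                "question_synonyms": {
                    "type": "synonym",
                    "synonyms": [
                        "city, urban location",
                        "people, population, homo sapiens",
                    ],
                }
            },
            "analyzer": {
                "english_with_synonyms": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "question_synonyms"],
                }
            },
        }
    },
    mappings={
        "properties": {
            "text": {"type": "text", "analyzer": "english_with_synonyms"},
        }
    },
)

Maintaining such a synonym list by hand quickly becomes the thesaurus problem described above, which is exactly what embeddings sidestep.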
Let's return to our original query, with a slightly different phrasing, and see if we can visualize the embeddings and results, similarly to our demonstration with single words like “cat” and “dog”.
df = search_embeddings("What is the most populated city in the world?")
df
text | embeddings | score | |
---|---|---|---|
0 | What is the most populated city in the world? | [0.13103964924812317, 0.016204705461859703, -0... | 1.000000 |
1 | Which is the most populated city in the world.? | [0.14043977856636047, 0.024480342864990234, -0... | 0.990207 |
2 | What are the most populated cities in the world? | [0.1380167156457901, -0.0007771807722747326, -... | 0.986257 |
3 | Which is the most populated country? | [0.14331300556659698, 0.005592127796262503, -0... | 0.918946 |
4 | Which is the largest city in the world? | [0.12255951017141342, 0.03331490233540535, -0.... | 0.898932 |
5 | Which is biggest city in the world? | [0.12018731981515884, 0.04687153548002243, -0.... | 0.894474 |
6 | What's the world's largest city? | [0.12429719418287277, 0.010167860426008701, -0... | 0.893242 |
7 | What is the biggest city in the world by area? | [0.1517365276813507, 0.03082786500453949, -0.0... | 0.885677 |
8 | Which is the largest city in the world by area? | [0.1522066444158554, 0.023621227592229843, -0.... | 0.881623 |
9 | What is the most isolated city in the world, w... | [0.10091247409582138, -0.011301622726023197, -... | 0.881128 |
We will need to convert the dataframe from “wide” to “long” format again, using the explode()
dataframe method.
# Store the original index values (0-9) as position
df["position"] = [list(range(len(df.embeddings[i]))) for i in df.index]
# Convert the `embeddings` and `position` columns from "wide" to "long" format
source = df.explode(["embeddings", "position"])
# Rename the `embeddings` column to `embedding`
source = source.rename(columns={ "embeddings": "embedding"})
source
text | embedding | score | position | |
---|---|---|---|---|
0 | What is the most populated city in the world? | 0.13104 | 1.000000 | 0 |
0 | What is the most populated city in the world? | 0.016205 | 1.000000 | 1 |
0 | What is the most populated city in the world? | -0.04909 | 1.000000 | 2 |
0 | What is the most populated city in the world? | 0.069318 | 1.000000 | 3 |
0 | What is the most populated city in the world? | -0.02237 | 1.000000 | 4 |
... | ... | ... | ... | ... |
9 | What is the most isolated city in the world, w... | 0.009746 | 0.881128 | 379 |
9 | What is the most isolated city in the world, w... | 0.011482 | 0.881128 | 380 |
9 | What is the most isolated city in the world, w... | -0.004106 | 0.881128 | 381 |
9 | What is the most isolated city in the world, w... | -0.031268 | 0.881128 | 382 |
9 | What is the most isolated city in the world, w... | 0.081563 | 0.881128 | 383 |
3840 rows × 4 columns
Let's create a “heatmap” of the embeddings for each result with Vega-Altair.
import altair as alt
alt.Chart(
source
).encode(
alt.X("position:N", title="").axis(labels=False, ticks=False),
alt.Y("text:N", title="", sort=source["score"].unique()).axis(labelLimit=300, tickWidth=0, labelFontWeight="bold"),
alt.Color("embedding:Q").scale(scheme="goldred").legend(None),
).mark_rect(
width=3
).properties(width=alt.Step(3), height=alt.Step(25))
With a slight risk of doing a “reading from tea leaves” kind of analysis, we can still discern specific patterns in the chart. Notice the very similar visual pattern of the first three results. The fourth result breaks that pattern somewhat, perhaps because it's about the most populated country instead of a city. Similarly, the two results related to the largest city by area form a distinct visual pattern.
However, as before, we can see how challenging it is to understand visualizations with a high number of dimensions. Let's try to reduce the dimensionality again, and plot the results on a two-dimensional plane.
import numpy as np
from sklearn.decomposition import PCA
# Initialize the PCA reducer to convert embeddings into arrays of length of 2
reducer = PCA(n_components=2)
# Drop the `position` column as it's no longer needed
df.drop(columns=["position"], inplace=True, errors="ignore")
# Convert embeddings to a 2D array and display their shape
embeddings = np.stack(df["embeddings"].to_numpy())
print("Embeddings shape:", embeddings.shape)
# Reduce the embeddings, store them in a new dataframe column and display their shape
df["reduced"] = reducer.fit_transform(np.stack(df["embeddings"])).tolist()
print("Reduced embeddings shape:", np.stack(df["reduced"]).shape)
df
Embeddings shape: (10, 384) Reduced embeddings shape: (10, 2)
text | embeddings | score | reduced | |
---|---|---|---|---|
0 | What is the most populated city in the world? | [0.13103964924812317, 0.016204705461859703, -0... | 1.000000 | [0.2689571906567488, -0.05183763632548388] |
1 | Which is the most populated city in the world.? | [0.14043977856636047, 0.024480342864990234, -0... | 0.990207 | [0.2659064235264346, -0.0605416637248274] |
2 | What are the most populated cities in the world? | [0.1380167156457901, -0.0007771807722747326, -... | 0.986257 | [0.3116296072174778, -0.03460714465962103] |
3 | Which is the most populated country? | [0.14331300556659698, 0.005592127796262503, -0... | 0.918946 | [0.5231726748738307, -0.20330644415362628] |
4 | Which is the largest city in the world? | [0.12255951017141342, 0.03331490233540535, -0.... | 0.898932 | [-0.3049644162967989, -0.08541420414484] |
5 | Which is biggest city in the world? | [0.12018731981515884, 0.04687153548002243, -0.... | 0.894474 | [-0.31563328983856964, -0.06104270393610539] |
6 | What's the world's largest city? | [0.12429719418287277, 0.010167860426008701, -0... | 0.893242 | [-0.30214103181593205, -0.07895521461706545] |
7 | What is the biggest city in the world by area? | [0.1517365276813507, 0.03082786500453949, -0.0... | 0.885677 | [-0.2920833305395347, 0.017243447429835457] |
8 | Which is the largest city in the world by area? | [0.1522066444158554, 0.023621227592229843, -0.... | 0.881623 | [-0.2929915299154269, -0.01369589759517316] |
9 | What is the most isolated city in the world, w... | [0.10091247409582138, -0.011301622726023197, -... | 0.881128 | [0.1381477021317695, 0.5721574617269066] |
With the data enriched with the reduced
column, let's feed it to the scatterplot()
function we have defined before.
source = pd.DataFrame({
"text": df["text"],
"x": df["reduced"].apply(lambda x: x[0]).to_list(),
"y": df["reduced"].apply(lambda x: x[1]).to_list(),
})
scatterplot(source, tooltips=True)
We can observe two fairly well-defined clusters. The one on the left groups results related to the “largest city”, the one on the right groups results specifically about “population”. The two remaining results are clear outliers. Even with the dimensionality of text embeddings reduced from 384 to 2, they keep the “embedded” meaning within the system of signs of human language.
The Power of Semantic Search¶
Hopefully this notebook has helped you understand not only how, but also why semantic search works, by tying the technical discussion of text embeddings to the broader theory of language, represented by Ferdinand de Saussure's Course in General Linguistics. Regardless of whether we search for “city” or “urban location”, “people” or “homo sapiens”, thanks to the meaning of these concepts “embedded” in the pre-trained model, we get (mostly) relevant results. This does not mean that lexical search is “dead”, but thanks to the maturity and availability of machine learning models, semantic search looks more and more promising.
Let's see one final example, which demonstrates rather well the difference between lexical and semantic search.
We will look for documents matching the phrase “How can I be a good geologist?”
, taken from the sample of questions marked as duplicates in the dataset. We will return 20 results instead of the default 10, first with a lexical search.
results_lexical = search_keywords("How can I be a good geologist?", size=20)
results_lexical.styled()
score | text |
---|---|
23.077 | How can I be a good geologist? |
14.648 | How do geologists predict earthquakes? |
14.540 | How can a mid career geologist become a civil engineer? |
14.043 | What should I do to be a great geologist? |
13.838 | How do geologists classify crystal structures? |
13.785 | What do geologists do? |
13.785 | Does Google hire geologist? |
13.223 | How can a petroleum geologist describe "drilling on structure" and "drilling off structure"? |
10.124 | What knowledge should I have to be a geologist in nasa or isro or some other space research organisations? |
10.099 | How I can be good at handwriting? |
10.099 | How can I be a good consultant? |
10.099 | How can I be a good engineer? |
10.099 | How can I be good engineer? |
10.099 | How can I be good at math? |
10.099 | How can I be a good copywriter? |
10.099 | How can I be a good translator? |
10.099 | How can I be a good programmer? |
10.099 | How can I be good in programming? |
10.099 | How can I be a good father? |
10.099 | How can I be good at sex? |
The document matching our query verbatim is returned first, as expected, due to the exact keyword match. However, the only other relevant result appears to be “What should I do to be a great geologist?”
. And even this result has a rather low score — after all, it is marked in the dataset as a “duplicate question”, so its meaning is the same. The bottom results seem to be practically irrelevant, matching mostly on the phrase “how can I (…)”
, ignoring the rest of the sentence completely.
Now, let's try the same query with a semantic search.
results_semantic = search_embeddings("How can I be a good geologist?", size=20)
results_semantic.styled()
score | text |
---|---|
1.000 | How can I be a good geologist? |
0.968 | What should I do to be a great geologist? |
0.874 | What do geologists do? |
0.862 | How can a mid career geologist become a civil engineer? |
0.862 | What knowledge should I have to be a geologist in nasa or isro or some other space research organisations? |
0.841 | Where is a good place to study geology? |
0.822 | What can one do to get accepted into a top geology undergrad college? |
0.811 | How can I be a great scientist? |
0.811 | What will be my future if I take Bsc Geology? |
0.810 | What are common required and elective courses in geology? |
0.806 | How do I become an archaeologist? |
0.801 | What kind of first jobs do geology majors tend to get? |
0.799 | How could degree programs in geology prepare students better for careers? |
0.798 | What are some interesting areas of geology that I could do undergraduate honors thesis in? |
0.796 | What will you learn in an Earth science class? |
0.795 | How can I become a good physicist? |
0.792 | How can one be a good physicist? |
0.787 | How can I become both a good scientist and a good engineer? |
0.787 | What is the best way to pursue masters in geology at a very good university with a graduate degree in some other discipline like computer science? |
0.786 | How can I become a scientist? |
Without much surprise, the two results matching our query most closely are returned first. (The score of 1.0
means that the document and query are semantically identical in the vector space.) But almost all the results are semantically relevant, one way or another, to the question in our query.
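As a side note, for a dense_vector field configured with cosine similarity, Elasticsearch documents the returned _score as (1 + cosine(query, vector)) / 2, which is why the scores fall between 0 and 1. We can check that for the top result with a quick sketch:

import numpy as np

query_embedding = model.encode("How can I be a good geologist?")
top_hit_embedding = np.array(results_semantic["embeddings"][0])

# Cosine similarity between the query and the top result
cosine = np.dot(query_embedding, top_hit_embedding) / (
    np.linalg.norm(query_embedding) * np.linalg.norm(top_hit_embedding)
)

print("Recomputed score:   ", (1 + cosine) / 2)
print("Elasticsearch score:", results_semantic["score"][0])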
To inspect the “semantic landscape” of the entire dataset, we can display the embeddings with the Atlas utility by Nomic.ai, an interactive “map” of very large vector spaces.
%%HTML
<iframe id="atlas-map-quora" src="about:blank" width="100%" height="400"></iframe>
<script type="text/javascript">
window.onscroll = function(e) { // Lazy-load the iframe
const iframe = document.querySelector("#atlas-map-quora");
const rect = iframe.getBoundingClientRect();
if (rect.top < window.innerHeight && iframe.src === "about:blank") { iframe.src = "https://atlas.nomic.ai/map/quora-embeddings"; }
}
</script>
Resources and Further Reading¶
- The Semantic Search article by James Briggs has been the original inspiration for this experiment
- What is Semantic Search? by Luis Serrano from Cohere provides an excellent introduction to semantic search with great examples
- What Are Word and Sentence Embeddings?, again by Luis Serrano, presents another look at text embeddings, with great illustrations
- Dense Vectors: Capturing Meaning with Code gives a thorough technical explanation of the history behind “dense vectors”
- The Elasticsearch: Vector and Hybrid Search presentation by Philipp Krenn offers a great high-level overview of current support for “vector search” in Elasticsearch
- How to deploy NLP: Text Embeddings and Vector Search by Mayya Sharipova demonstrates the nuts and bolts of “approximate nearest neighbor” in Elasticsearch by using an embedded model
Thanks and Acknowledgments¶
Thanks to Josef Šlerka for his patience and encouragement over the days and weeks I have been working on this notebook. I wouldn't have finished it otherwise.